Breast Cancer Project
Libraries and Functions
library(data.table) # Data table Tools
library(summarytools) # Summary Tools
library(ggplot2) # Plot Visualization Tool
library(data.table) # fread
library(rbokeh) # visualization of data
library(summarytools) # ORIGINALSummary
library(adabag) # bagging and boosting
library(caret) # pre-processing
library(dplyr) # select, mutate_if
library(fastDummies) # dummy_cols
library(splitTools) # data partition
library(rpart) # classification tree
library(rpart.plot) # plot regression trees
library(DT) # datatable
library(corrplot) # corrplot
library(gains) # gain
library(randomForest) # randomForest
library(cluster) # hierarchical clustering
library(knitr) # kable
library(kableExtra) # kbl
library(MASS) # lda, qda, etc.
library(dplyr) # Data Wrangling Tools
library(klaR) # partimat
library(forecast)
library(pROC)
library(tibble)
library(mda) # mda
library(RColorBrewer) # Color Palette
library(tidyverse) # useful Dataframe tools
library(glmnet) # Logistic Lasso Regression
library(car) # VIF
library(ROCR) # ROC Curve
library(neuralnet) # Neural Network
library(nnet) # Neural network
library(factoextra) # K-Means Clustering
library(ggpubr) # ggplot addons
library(GGally) # pairs plots
library (naniar)
library(rmdformats)# Confusion matrix
# Draw a two-panel graphic for a caret::confusionMatrix result using base
# graphics: top panel shows the coloured 2x2 matrix, bottom panel ("DETAILS")
# shows per-class statistics plus overall accuracy and kappa.
#   cm         - object returned by caret::confusionMatrix()
#   titleaddon - optional text appended to the "CONFUSION MATRIX" title
# NOTE(review): cell labels hard-code this project's Benign/Malignant classes.
draw_confusion_matrix <- function(cm, titleaddon = '') {
# Stack two panels: matrix gets 2/3 of the height, details 1/3.
layout(matrix(c(1,1,2)))
par(mar=c(2,2,2,2))
# Empty canvas; all rectangles/labels below use absolute plot coordinates.
plot(c(100, 345), c(300, 450), type = "n", xlab="", ylab="", xaxt='n', yaxt='n')
title(paste0('CONFUSION MATRIX', ' ', titleaddon), cex.main=2)
# create the matrix
rect(150, 430, 240, 370, col='#1c6155')
text(195, 435, 'Benign', cex=1.2)
rect(250, 430, 340, 370, col='#1c615570')
text(295, 435, 'Malignant', cex=1.2)
text(125, 370, 'Predicted', cex=1.3, srt=90, font=2)
text(245, 450, 'Actual', cex=1.3, font=2)
rect(150, 305, 240, 365, col='#1c615570')
rect(250, 305, 340, 365, col='#1c6155')
text(140, 400, 'Benign', cex=1.2, srt=90)
text(140, 335, 'Malignant', cex=1.2, srt=90)
# add in the cm results
# as.numeric() flattens the 2x2 table column-major: res[1:2] are the first
# column, res[3:4] the second. Presumably this order matches the four cells
# drawn below -- TODO(review): confirm against cm$table's row/col convention.
res <- as.numeric(cm$table)
text(195, 400, res[1], cex=1.6, font=2, col='white')
text(195, 335, res[2], cex=1.6, font=2, col='white')
text(295, 400, res[3], cex=1.6, font=2, col='white')
text(295, 335, res[4], cex=1.6, font=2, col='white')
# add in the specifics
plot(c(100, 0), c(100, 0), type = "n", xlab="", ylab="", main = "DETAILS", xaxt='n', yaxt='n')
# byClass[1,2,5,6,7,8]: selected per-class statistics, name shown above value.
text(5, 85, names(cm$byClass[1]), cex=1.2, font=2)
text(5, 70, round(as.numeric(cm$byClass[1]), 3), cex=1.2)
text(23, 85, names(cm$byClass[2]), cex=1.2, font=2)
text(23, 70, round(as.numeric(cm$byClass[2]), 3), cex=1.2)
text(41, 85, names(cm$byClass[5]), cex=1.2, font=2)
text(41, 70, round(as.numeric(cm$byClass[5]), 3), cex=1.2)
text(59, 85, names(cm$byClass[6]), cex=1.2, font=2)
text(59, 70, round(as.numeric(cm$byClass[6]), 3), cex=1.2)
text(77, 85, names(cm$byClass[7]), cex=1.2, font=2)
text(77, 70, round(as.numeric(cm$byClass[7]), 3), cex=1.2)
text(95, 85, names(cm$byClass[8]), cex=1.2, font=2)
text(95, 70, round(as.numeric(cm$byClass[8]), 3), cex=1.2)
# add in the accuracy information
# overall[1:2] are the first two overall statistics (accuracy and kappa here).
text(30, 35, names(cm$overall[1]), cex=1.5, font=2)
text(30, 20, round(as.numeric(cm$overall[1]), 3), cex=1.4)
text(70, 35, names(cm$overall[2]), cex=1.5, font=2)
text(70, 20, round(as.numeric(cm$overall[2]), 3), cex=1.4)
} Data Analysis
# Load the raw breast-cancer data (569 x 32 per the summary below) with
# data.table::fread. NOTE(review): the trailing "Structure of Data" text is a
# section heading fused onto this line by the document export.
ORIGINAL <- fread("data/Breast_Cancer/breast-cancer.csv")Structure of Data
# printing summary
# Render an HTML per-variable overview (stats, frequencies, missingness).
print(dfSummary(ORIGINAL, valid.col = FALSE, graph.magnif = 0.75, plain.ascii = FALSE, html = TRUE, style ='grid', silent = TRUE), max.tbl.height = 300, width = 80, method = "render")Data Frame Summary
ORIGINAL
Dimensions: 569 x 32Duplicates: 0
| No | Variable | Stats / Values | Freqs (% of Valid) | Graph | Missing | ||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | id [integer] |
|
569 distinct values | 0 (0.0%) | |||||||||||
| 2 | diagnosis [character] |
|
|
0 (0.0%) | |||||||||||
| 3 | radius_mean [numeric] |
|
456 distinct values | 0 (0.0%) | |||||||||||
| 4 | texture_mean [numeric] |
|
479 distinct values | 0 (0.0%) | |||||||||||
| 5 | perimeter_mean [numeric] |
|
522 distinct values | 0 (0.0%) | |||||||||||
| 6 | area_mean [numeric] |
|
539 distinct values | 0 (0.0%) | |||||||||||
| 7 | smoothness_mean [numeric] |
|
474 distinct values | 0 (0.0%) | |||||||||||
| 8 | compactness_mean [numeric] |
|
537 distinct values | 0 (0.0%) | |||||||||||
| 9 | concavity_mean [numeric] |
|
537 distinct values | 0 (0.0%) | |||||||||||
| 10 | concave points_mean [numeric] |
|
542 distinct values | 0 (0.0%) | |||||||||||
| 11 | symmetry_mean [numeric] |
|
432 distinct values | 0 (0.0%) | |||||||||||
| 12 | fractal_dimension_mean [numeric] |
|
499 distinct values | 0 (0.0%) | |||||||||||
| 13 | radius_se [numeric] |
|
540 distinct values | 0 (0.0%) | |||||||||||
| 14 | texture_se [numeric] |
|
519 distinct values | 0 (0.0%) | |||||||||||
| 15 | perimeter_se [numeric] |
|
533 distinct values | 0 (0.0%) | |||||||||||
| 16 | area_se [numeric] |
|
528 distinct values | 0 (0.0%) | |||||||||||
| 17 | smoothness_se [numeric] |
|
547 distinct values | 0 (0.0%) | |||||||||||
| 18 | compactness_se [numeric] |
|
541 distinct values | 0 (0.0%) | |||||||||||
| 19 | concavity_se [numeric] |
|
533 distinct values | 0 (0.0%) | |||||||||||
| 20 | concave points_se [numeric] |
|
507 distinct values | 0 (0.0%) | |||||||||||
| 21 | symmetry_se [numeric] |
|
498 distinct values | 0 (0.0%) | |||||||||||
| 22 | fractal_dimension_se [numeric] |
|
545 distinct values | 0 (0.0%) | |||||||||||
| 23 | radius_worst [numeric] |
|
457 distinct values | 0 (0.0%) | |||||||||||
| 24 | texture_worst [numeric] |
|
511 distinct values | 0 (0.0%) | |||||||||||
| 25 | perimeter_worst [numeric] |
|
514 distinct values | 0 (0.0%) | |||||||||||
| 26 | area_worst [numeric] |
|
544 distinct values | 0 (0.0%) | |||||||||||
| 27 | smoothness_worst [numeric] |
|
411 distinct values | 0 (0.0%) | |||||||||||
| 28 | compactness_worst [numeric] |
|
529 distinct values | 0 (0.0%) | |||||||||||
| 29 | concavity_worst [numeric] |
|
539 distinct values | 0 (0.0%) | |||||||||||
| 30 | concave points_worst [numeric] |
|
492 distinct values | 0 (0.0%) | |||||||||||
| 31 | symmetry_worst [numeric] |
|
500 distinct values | 0 (0.0%) | |||||||||||
| 32 | fractal_dimension_worst [numeric] |
|
535 distinct values | 0 (0.0%) |
Generated by summarytools 1.0.1 (R version 4.2.1)
2022-12-20
Missing values
# Missing-value count per variable (naniar); the summary above shows 0 NAs.
gg_miss_var(ORIGINAL) + ggtitle("NAs")Distribution of Data
# Center and scale every numeric column so all boxplots share one scale.
norm.value <- preProcess(ORIGINAL, method = c("center", "scale"))
ORIGINAL.boxplot <- predict(norm.value, ORIGINAL)
# Long format (one row per variable/value pair), with the id column dropped.
ORIGINAL.boxplot <- melt(dplyr::select(ORIGINAL.boxplot, -c(id)))
library(ggplot2)
# One boxplot (with error-bar whiskers) per predictor, split by diagnosis.
ggplot(ORIGINAL.boxplot, aes(x = diagnosis, y = value)) +
facet_wrap(~variable) +
stat_boxplot(geom ='errorbar') +
geom_boxplot()# histogram over all columns
# Histogram of every predictor (id dropped); free scales, one facet per column.
ggplot(gather(dplyr::select(ORIGINAL, -c(id)), key, value, -diagnosis), aes(value)) +
geom_histogram(bins = 10) +
facet_wrap(~key, scales = 'free')# histogram over all columns grouped by diagnosis
# Density-scaled, class-coloured histograms comparing the benign and malignant
# distributions of each predictor.
# FIX: `..density..` notation is deprecated since ggplot2 3.4.0 -> after_stat(density).
# FIX: typo "seperated" -> "separated" in the plot title.
ggplot(gather(dplyr::select(ORIGINAL, -c(id)), key, value, -diagnosis), aes(value, fill = factor(diagnosis))) +
geom_histogram(aes(y = after_stat(density)), alpha = 0.6, position = "identity") +
facet_wrap(~key, scales = 'free') +
ggtitle("Histogram of predictors separated by class") +
theme(plot.title = element_text(hjust = 0.5)) +
guides(fill=guide_legend(title="Diagnosis"))+
scale_fill_discrete(labels=c('benign', 'malignant'))## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Correlation
# Color Palette
library("RColorBrewer")
# select numeric variables
# Drop id and diagnosis (columns 1-2); na.omit is a safety net -- the summary
# above reports no missing values.
Corr_Data <- na.omit(ORIGINAL[,-c(1,2)])
Corr_plot <- cor(Corr_Data)
# Correlations plotting
# Heatmap of pairwise correlations with the coefficient printed in each cell.
corrplot(Corr_plot, method = "color", col=brewer.pal(n=8, name="BuGn"),tl.col="black",tl.srt=45, addCoef.col = "black",number.cex = 1)Pairs plots
predictors means plots:
# Pairs plot of the ten "_mean" predictors (columns 3:12), coloured by diagnosis.
ggpairs(ORIGINAL, columns = 3:12, aes(color = diagnosis, alpha = 0.5),
diag = list(continuous = "blankDiag"), title = "Pairs plot for Means of Predictors")+
theme(axis.text.x = element_text(angle = 90), axis.title.y.right = element_text(size = 0.4)) # readable x axispredictors SE plots:
# Pairs plot of the ten "_se" predictors (columns 13:22).
ggpairs(ORIGINAL, columns = 13:22, aes(color = diagnosis, alpha = 0.5),
diag = list(continuous = "blankDiag"), title = "Pairs plot for SE of Predictors")+
theme(axis.text.x = element_text(angle = 90), axis.title.y.right = element_text(size = 0.4)) # readable x axispredictors “worst” plots:
# Pairs plot of the ten "_worst" predictors (columns 23:32).
ggpairs(ORIGINAL, columns = 23:32, aes(color = diagnosis, alpha = 0.5),
diag = list(continuous = "blankDiag"), title = "Pairs plot for Worsts of Predictors")+
theme(axis.text.x = element_text(angle = 90),axis.title.y.right = element_text(size = 0.4)) # readable x axisData Preparation
Transformation
# Transform Character Format to Binary Numerical Values on Outcome Variable "Diagnosis"
ORIGINAL[diagnosis == "M", c("diagnosis")] <- 1 # 1 for Malign Outcome
ORIGINAL[diagnosis == "B", c("diagnosis")] <- 0 # 0 for Benign Outcome
# Recoding happens while the column is still character; converting afterwards
# yields a two-level factor with levels "0" (benign) and "1" (malignant).
ORIGINAL$diagnosis <- as.factor(ORIGINAL$diagnosis) # To Factor Variable Partitioning
We partition the data into Training (50%), Validation (30%) and Test (20%)
set.seed(1)
# Splitting each Set from the ORIGINAL Dataset
# Each row is independently assigned 1=Training, 2=Validation, 3=Test with
# probabilities 0.5/0.3/0.2, so realised proportions only approximate the targets
# (52.7/27.9/19.3 here, per the printed output below).
splitting <- sample(1:3,size=nrow(ORIGINAL),replace=TRUE,prob=c(0.5,0.3,0.2))
Training <- ORIGINAL[splitting==1,]
Validation <- ORIGINAL[splitting==2,]
Test <- ORIGINAL[splitting==3,]
# Checking if proportions are right
Prop_Training <- (nrow(Training)/nrow(ORIGINAL))*100
Prop_Validation <- (nrow(Validation)/nrow(ORIGINAL))*100
Prop_Test <- (nrow(Test)/nrow(ORIGINAL))*100
# Print Proportion
paste("The Proportions are:", round(Prop_Training,2),"% In Training,",round(Prop_Validation,2),"% In Validation, and ",round(Prop_Test,2),"% In Test")## [1] "The Proportions are: 52.72 % In Training, 27.94 % In Validation, and 19.33 % In Test"
Supervised Learning
Logistic Regression
Assumptions for Logistic Regression:
- The dependent variable must be categorical in nature.
- The independent variable should not have multi-collinearity.
Type of Logistic Regression:
- Binomial (we will use this type since we only have binary outcomes, malign or benign)
- Multinomial
- Ordinal
Fit the Logistic Regression Model
set.seed(1)
# Duplicate the Training and Validation Set
Training_Logistic <- Training
Validation_Logistic <- Validation
Test_Logistic <- Test
# Remove the "ID" Variable
# id is a record identifier, not a predictor.
Training_Logistic <- Training_Logistic[,-c("id")]
Validation_Logistic <- Validation_Logistic[,-c("id")]
# NOTE(review): "set.seed(1)" is fused onto the end of the next line by the
# document export; in the source chunk it is a separate statement.
Test_Logistic <- Test_Logistic[,-c("id")]set.seed(1)
# Fit The Logistic Regression Model
# Full model with all 30 predictors on ~300 training rows; the data separate
# completely, so the coefficients blow up (see the Comments below the output).
Logistic_Model_1 <- glm(diagnosis ~ ., family=binomial(link='logit'), data=Training_Logistic)
# Disable Scientific Notation
options(scipen=999)
# Model Summary
summary(Logistic_Model_1)##
## Call:
## glm(formula = diagnosis ~ ., family = binomial(link = "logit"),
## data = Training_Logistic)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.000070518 -0.000000021 -0.000000021 0.000000021 0.000070585
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -509.1177 2465238.9085 0.000 1.000
## radius_mean -93.2456 323382.8767 0.000 1.000
## texture_mean 2.3094 22523.0765 0.000 1.000
## perimeter_mean -11.6840 52308.6274 0.000 1.000
## area_mean 1.7104 2262.5035 0.001 0.999
## smoothness_mean -685.4357 5319377.9496 0.000 1.000
## compactness_mean -308.1579 3901442.1032 0.000 1.000
## concavity_mean -1285.6905 2106831.7374 -0.001 1.000
## `concave points_mean` 4039.1718 3635907.7100 0.001 0.999
## symmetry_mean 514.8607 1026728.8154 0.001 1.000
## fractal_dimension_mean -1007.7298 10373128.4516 0.000 1.000
## radius_se 471.8883 1265976.6239 0.000 1.000
## texture_se -21.9098 111341.2698 0.000 1.000
## perimeter_se -63.4491 106210.1312 -0.001 1.000
## area_se 0.8631 7763.2004 0.000 1.000
## smoothness_se -17211.4580 12787117.2047 -0.001 0.999
## compactness_se -337.2308 5311033.6271 0.000 1.000
## concavity_se -1468.9225 2776892.5385 -0.001 1.000
## `concave points_se` 19028.2832 11931195.6087 0.002 0.999
## symmetry_se 937.7632 2878359.8525 0.000 1.000
## fractal_dimension_se -29868.7301 27693601.1023 -0.001 0.999
## radius_worst 26.9500 151813.9701 0.000 1.000
## texture_worst 6.1503 18116.1813 0.000 1.000
## perimeter_worst 15.4087 17407.7046 0.001 0.999
## area_worst -1.0357 711.8379 -0.001 0.999
## smoothness_worst 1805.2835 2885425.5533 0.001 1.000
## compactness_worst -396.3357 876270.4511 0.000 1.000
## concavity_worst 687.5920 686549.7089 0.001 0.999
## `concave points_worst` -2016.1383 1948482.2901 -0.001 0.999
## symmetry_worst -188.1559 462821.2593 0.000 1.000
## fractal_dimension_worst 4064.7762 3245370.1644 0.001 0.999
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 393.18719695395 on 299 degrees of freedom
## Residual deviance: 0.00000005134 on 269 degrees of freedom
## AIC: 62
##
## Number of Fisher Scoring iterations: 25
Comments: Trying to fit every variable in our logistic regression produced a convergence error caused by complete separation. To deal with this we should use a penalized model, because too many variables are included in the model relative to the number of observations. (See [Convergence Error in Logistic Regression] and [Penalized Logistic Regression Essentials in R: Ridge, Lasso and Elastic Net] in References)
Fit a Penalized Model for Logistic Regression
There is 3 differents methods when it comes to Penalized Logistic Regression Model:
- ridge regression: variables with minor contribution have their coefficients close to zero. However, all the variables are incorporated in the model. This is useful when all variables need to be incorporated in the model according to domain knowledge.
- lasso regression: the coefficients of some less contributive variables are forced to be exactly zero. Only the most significant variables are kept in the final model.
- elastic net regression: the combination of ridge and lasso regression. It shrinks some coefficients toward zero (like ridge regression) and set some coefficients to exactly zero (like lasso regression)
For this case, we will use a Lasso Regression Model being way more strict in attributing less to no weight to variables not significant enough.
# Required Packages
library(tidyverse)
library(caret)
library(glmnet)
# Setting Seed
set.seed(1)
# Define response variable
# NOTE(review): as.numeric() on the factor yields codes 1/2 (not 0/1), and
# cv.glmnet is called with its default gaussian family -- so this is a lasso on
# a numeric recoding of the class rather than a binomial lasso. Workable for
# variable screening, but family = "binomial" would be the cleaner choice.
y_lasso <- as.numeric(Training_Logistic$diagnosis)
# Define matrix of predictor variables
x_lasso <- data.matrix(Training_Logistic[,-c("diagnosis")])
# Perform k-fold cross-validation to find optimal lambda value - alpha = 1 is for using Lasso Method
cv_model <- cv.glmnet(x_lasso, y_lasso, alpha = 1)
# Find optimal lambda value that minimizes test MSE
best_lambda <- cv_model$lambda.min
print(paste("Best Lambda is equal to",best_lambda))## [1] "Best Lambda is equal to 0.000578270146672675"
# Produce plot of test MSE by lambda value
plot(cv_model) Comments: We want to use the lowest MSE and thus find the optimal Lambda.
set.seed(1)
# Use optimal lambda value and alpha = 1 is for using Lasso Method
# Refit a single lasso model at the cross-validated lambda.min found above.
Logistic_Lasso_Optimal <- glmnet(x_lasso, y_lasso, alpha = 1, lambda = best_lambda)
# Disable Scientific Notation
options(scipen=999)
# Model Summary
# Sparse coefficient vector: "." entries were shrunk exactly to zero.
Logistic_Lasso_Optimal$beta## 30 x 1 sparse Matrix of class "dgCMatrix"
## s0
## radius_mean .
## texture_mean 0.0092460114
## perimeter_mean 0.0007441971
## area_mean .
## smoothness_mean -0.6726642078
## compactness_mean -3.0655109153
## concavity_mean 2.3768884384
## concave points_mean .
## symmetry_mean 0.6146556627
## fractal_dimension_mean -5.5409126922
## radius_se 0.3947352904
## texture_se 0.0125006149
## perimeter_se 0.0175743608
## area_se -0.0026599944
## smoothness_se 2.4042915125
## compactness_se -2.1473061382
## concavity_se -3.0067441349
## concave points_se 8.7579605809
## symmetry_se 6.9831540153
## fractal_dimension_se -7.8748193831
## radius_worst 0.0873539874
## texture_worst .
## perimeter_worst 0.0010958466
## area_worst -0.0004789178
## smoothness_worst 2.8169401453
## compactness_worst 0.2083513325
## concavity_worst 0.0294152220
## concave points_worst 1.3314761934
## symmetry_worst .
## fractal_dimension_worst 5.2951497825
Comments: We can see that the lasso has shrunk some coefficients to exactly 0. This is expected with Lasso Regression, since it removes insignificant variables completely instead of merely assigning them a very low coefficient.
Logistic Regression using Variables from Lasso Selection
set.seed(1)
# Fit The Logistic Regression Model with only selected variables from Lasso
# 23 predictors are kept here. NOTE(review): this set does not exactly match
# the non-zero lasso coefficients above -- e.g. area_mean was shrunk to zero
# yet is included, while compactness_se and smoothness_worst (non-zero) are
# excluded. Verify the selection against the lasso output.
Logistic_Model_After <- glm(diagnosis ~ area_mean + smoothness_mean + compactness_mean + concavity_mean + `concave points_mean` + symmetry_mean + fractal_dimension_mean + radius_se + texture_se + perimeter_se + area_se + smoothness_se + concavity_se + `concave points_se` + symmetry_se + fractal_dimension_se + radius_worst + texture_worst + area_worst + concavity_worst + `concave points_worst`+ symmetry_worst + fractal_dimension_worst , family=binomial(link='logit'), data=Training_Logistic)
# Disable Scientific Notation
options(scipen=999)
# Model Summary
summary(Logistic_Model_After)##
## Call:
## glm(formula = diagnosis ~ area_mean + smoothness_mean + compactness_mean +
## concavity_mean + `concave points_mean` + symmetry_mean +
## fractal_dimension_mean + radius_se + texture_se + perimeter_se +
## area_se + smoothness_se + concavity_se + `concave points_se` +
## symmetry_se + fractal_dimension_se + radius_worst + texture_worst +
## area_worst + concavity_worst + `concave points_worst` + symmetry_worst +
## fractal_dimension_worst, family = binomial(link = "logit"),
## data = Training_Logistic)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.000091587 -0.000000021 -0.000000021 0.000000021 0.000095973
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1036.5304 2846587.3158 0.000 1.000
## area_mean -0.1696 1462.2833 0.000 1.000
## smoothness_mean 1450.2601 4192046.3648 0.000 1.000
## compactness_mean -1731.9712 1906426.5985 -0.001 0.999
## concavity_mean 1469.0815 3711329.7662 0.000 1.000
## `concave points_mean` 1070.0724 4578141.0180 0.000 1.000
## symmetry_mean -165.2926 2286527.5901 0.000 1.000
## fractal_dimension_mean -437.0810 14827275.0943 0.000 1.000
## radius_se -7.1344 886865.7546 0.000 1.000
## texture_se -26.0697 118646.6300 0.000 1.000
## perimeter_se 17.0820 55798.5434 0.000 1.000
## area_se 1.1568 9851.6304 0.000 1.000
## smoothness_se -10007.8391 23596748.0400 0.000 1.000
## concavity_se -2628.1095 7354098.2654 0.000 1.000
## `concave points_se` 25297.8713 16023563.8416 0.002 0.999
## symmetry_se -931.2103 8047236.1089 0.000 1.000
## fractal_dimension_se -54837.6628 30037108.6303 -0.002 0.999
## radius_worst 19.9328 175061.2117 0.000 1.000
## texture_worst 6.0563 5777.3810 0.001 0.999
## area_worst 0.1061 2460.1674 0.000 1.000
## concavity_worst 112.5699 552215.1283 0.000 1.000
## `concave points_worst` -1639.6759 2321009.5161 -0.001 0.999
## symmetry_worst 260.3450 802972.6305 0.000 1.000
## fractal_dimension_worst 6959.5789 5101143.4606 0.001 0.999
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 393.187196953955 on 299 degrees of freedom
## Residual deviance: 0.000000089786 on 276 degrees of freedom
## AIC: 48
##
## Number of Fisher Scoring iterations: 25
Comments: Even though we performed a Lasso Regression, our standard Logistic Regression still fails to converge with this selection of variables. This could be due to the number of variables (23 in our case) relative to the low number of observations. Now it is time to check the VIF, since multicollinearity could be the main source of the problem.
VIF
We need to compute multiple stage when removing our multicolinear Variables, let’s see when we don’t have anymore problem of multicolinearity.
Test For Multicolinearity in Our Dataset using VIF - First Iteration
set.seed(1)
# Load the car library
library(car)
# Create vector of VIF values
# VIF > 5 (dashed line on the chart) is the multicollinearity threshold used here.
vif_values <- vif(Logistic_Model_After)
# Create horizontal bar chart to display each VIF value
barplot(vif_values, main = "VIF Values - First Iteration", horiz = FALSE, col = "steelblue", las=2)
# Add vertical line at 5
abline(h = 5, lwd = 3, lty = 2)# Call VIF Values
vif_values## area_mean smoothness_mean compactness_mean
## 12711.7372 474.1363 658.4854
## concavity_mean `concave points_mean` symmetry_mean
## 5543.8846 1826.2882 387.8132
## fractal_dimension_mean radius_se texture_se
## 836.5596 3823.2095 637.7963
## perimeter_se area_se smoothness_se
## 603.9070 9076.0715 736.7649
## concavity_se `concave points_se` symmetry_se
## 2048.6454 395.2401 291.1494
## fractal_dimension_se radius_worst texture_worst
## 236.8928 23366.1249 144.7868
## area_worst concavity_worst `concave points_worst`
## 52809.4217 611.4157 691.8691
## symmetry_worst fractal_dimension_worst
## 210.7082 519.5269
Comments: We can see that most of our variables exhibit multicollinearity (VIF over 5). We remove the variables with the highest VIF first, starting with 7 variables: radius_worst, area_worst, concavity_worst, concavity_mean, fractal_dimension_se, concavity_se and concave points_mean.
Logistic Regression with VIF removing 7 variables - Second Iteration
set.seed(1)
# Fit The Logistic Regression Model
# Second iteration: 16 predictors remain after dropping the highest-VIF variables.
Logistic_Model_After_VIF1 <- glm(diagnosis ~ area_mean + smoothness_mean + compactness_mean + symmetry_mean + fractal_dimension_mean + radius_se + texture_se + perimeter_se + area_se + smoothness_se + `concave points_se` + symmetry_se + texture_worst + `concave points_worst`+ symmetry_worst + fractal_dimension_worst , family=binomial(link='logit'), data=Training_Logistic)
# Disable Scientific Notation
options(scipen=999)
# Model Summary
summary(Logistic_Model_After_VIF1)##
## Call:
## glm(formula = diagnosis ~ area_mean + smoothness_mean + compactness_mean +
## symmetry_mean + fractal_dimension_mean + radius_se + texture_se +
## perimeter_se + area_se + smoothness_se + `concave points_se` +
## symmetry_se + texture_worst + `concave points_worst` + symmetry_worst +
## fractal_dimension_worst, family = binomial(link = "logit"),
## data = Training_Logistic)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.66215 -0.02679 -0.00130 0.00007 2.74659
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -28.101214 21.470757 -1.309 0.1906
## area_mean 0.005558 0.012446 0.447 0.6552
## smoothness_mean 95.448428 87.774297 1.087 0.2768
## compactness_mean -11.799465 43.355986 -0.272 0.7855
## symmetry_mean -50.291312 44.623905 -1.127 0.2597
## fractal_dimension_mean -170.084912 282.240446 -0.603 0.5468
## radius_se 16.946697 41.812049 0.405 0.6853
## texture_se -0.751951 2.176595 -0.345 0.7297
## perimeter_se -0.005054 1.670765 -0.003 0.9976
## area_se 0.054588 0.459390 0.119 0.9054
## smoothness_se 145.423653 413.481380 0.352 0.7251
## `concave points_se` -111.604244 340.577154 -0.328 0.7431
## symmetry_se -86.243828 166.526520 -0.518 0.6045
## texture_worst 0.412596 0.222259 1.856 0.0634 .
## `concave points_worst` 96.771896 44.386229 2.180 0.0292 *
## symmetry_worst 31.814364 23.021997 1.382 0.1670
## fractal_dimension_worst -11.018661 114.043706 -0.097 0.9230
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 393.187 on 299 degrees of freedom
## Residual deviance: 31.902 on 283 degrees of freedom
## AIC: 65.902
##
## Number of Fisher Scoring iterations: 11
Comments: After removing 7 variables, our model still suffers from multicollinearity — most coefficients are not significant — but the model can now converge. Let's check the VIF again.
Test For Multicolinearity in Our Dataset using VIF - Second Iteration
set.seed(1)
# Load the car library
library(car)
# Create vector of VIF values
# Recompute VIF for the reduced (16-predictor) model.
vif_values_2 <- vif(Logistic_Model_After_VIF1)
# Create horizontal bar chart to display each VIF value
barplot(vif_values_2, main = "VIF Values - Second Iteration", horiz = FALSE, col = "steelblue", las=2)
# Add vertical line at 5
abline(h = 5, lwd = 3, lty = 2)# Call VIF Values
vif_values_2## area_mean smoothness_mean compactness_mean
## 15.595581 6.145892 10.148517
## symmetry_mean fractal_dimension_mean radius_se
## 5.027208 13.100655 153.870701
## texture_se perimeter_se area_se
## 5.769376 6.997751 151.905108
## smoothness_se `concave points_se` symmetry_se
## 4.069325 12.270014 8.053918
## texture_worst `concave points_worst` symmetry_worst
## 6.419859 7.861339 10.357040
## fractal_dimension_worst
## 11.174660
Comments: We can still see high VIF values in our variables, let’s remove 6 variables again: radius_se, perimeter_se, area_se, compactness_mean, texture_se and texture_worst.
Logistic Regression with VIF removing 6 variables - Third Iteration
set.seed(1)
# Fit The Logistic Regression Model
# Third iteration: 10 predictors remain after dropping 6 more high-VIF variables.
Logistic_Model_After_VIF2 <- glm(diagnosis ~ area_mean + smoothness_mean + symmetry_mean + fractal_dimension_mean + smoothness_se + `concave points_se` + symmetry_se + `concave points_worst`+ symmetry_worst + fractal_dimension_worst , family=binomial(link='logit'), data=Training_Logistic)
# Disable Scientific Notation
options(scipen=999)
# Model Summary
summary(Logistic_Model_After_VIF2)##
## Call:
## glm(formula = diagnosis ~ area_mean + smoothness_mean + symmetry_mean +
## fractal_dimension_mean + smoothness_se + `concave points_se` +
## symmetry_se + `concave points_worst` + symmetry_worst + fractal_dimension_worst,
## family = binomial(link = "logit"), data = Training_Logistic)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6820 -0.0722 -0.0158 0.0093 4.3901
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -14.896503 7.552690 -1.972 0.048570 *
## area_mean 0.012886 0.003772 3.416 0.000634 ***
## smoothness_mean 48.131843 55.833555 0.862 0.388655
## symmetry_mean -7.054654 30.331490 -0.233 0.816084
## fractal_dimension_mean -261.372561 186.237770 -1.403 0.160487
## smoothness_se 514.036777 280.214593 1.834 0.066589 .
## `concave points_se` -139.375305 216.104775 -0.645 0.518964
## symmetry_se 30.505299 117.585842 0.259 0.795303
## `concave points_worst` 59.599022 23.907022 2.493 0.012669 *
## symmetry_worst 12.023580 16.496592 0.729 0.466092
## fractal_dimension_worst 64.714978 62.539298 1.035 0.300767
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 393.187 on 299 degrees of freedom
## Residual deviance: 51.151 on 289 degrees of freedom
## AIC: 73.151
##
## Number of Fisher Scoring iterations: 9
Comments: Now we can see that our Regression Model Converge, let’s compute a third iteration of VIF to check the multicolinearity.
Test For Multicolinearity in Our Dataset using VIF - Third Iteration
set.seed(1)
# Load the car library
library(car)
# Create vector of VIF values
# Recompute VIF for the 10-predictor model.
vif_values_3 <- vif(Logistic_Model_After_VIF2)
# Create horizontal bar chart to display each VIF value
barplot(vif_values_3, main = "VIF Values - Third Iteration", horiz = FALSE, col = "steelblue", las=2)
# Add vertical line at 5
abline(h = 5, lwd = 3, lty = 2)# Call VIF Values
vif_values_3## area_mean smoothness_mean symmetry_mean
## 2.036432 3.309360 3.734544
## fractal_dimension_mean smoothness_se `concave points_se`
## 8.371764 2.751976 5.868334
## symmetry_se `concave points_worst` symmetry_worst
## 5.369938 3.926850 7.396930
## fractal_dimension_worst
## 6.413996
Comments: We have indeed improved the model by excluding many multicollinear variables. A few variables still show a VIF above 5; let's remove fractal_dimension_mean (the highest) and see if that improves things.
Logistic Regression with VIF removing 1 variable - Fourth Iteration
set.seed(1)
# Fit The Logistic Regression Model with only selected variables from Lasso
# Fourth iteration: 9 predictors remain after dropping fractal_dimension_mean.
Logistic_Model_After_VIF3 <- glm(diagnosis ~ area_mean + smoothness_mean + symmetry_mean + smoothness_se + `concave points_se` + symmetry_se + `concave points_worst`+ symmetry_worst + fractal_dimension_worst , family=binomial(link='logit'), data=Training_Logistic)
# Disable Scientific Notation
options(scipen=999)
# Model Summary
summary(Logistic_Model_After_VIF3)##
## Call:
## glm(formula = diagnosis ~ area_mean + smoothness_mean + symmetry_mean +
## smoothness_se + `concave points_se` + symmetry_se + `concave points_worst` +
## symmetry_worst + fractal_dimension_worst, family = binomial(link = "logit"),
## data = Training_Logistic)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5703 -0.0904 -0.0212 0.0091 4.2589
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -21.569221 5.766024 -3.741 0.000183 ***
## area_mean 0.014251 0.003528 4.039 0.0000536 ***
## smoothness_mean 2.822523 42.528827 0.066 0.947085
## symmetry_mean -8.237752 28.629721 -0.288 0.773550
## smoothness_se 544.620641 260.927403 2.087 0.036866 *
## `concave points_se` -184.081058 189.735324 -0.970 0.331947
## symmetry_se -2.199196 104.630291 -0.021 0.983231
## `concave points_worst` 64.859936 22.683560 2.859 0.004245 **
## symmetry_worst 12.776311 14.687096 0.870 0.384355
## fractal_dimension_worst -2.007970 38.811877 -0.052 0.958739
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 393.187 on 299 degrees of freedom
## Residual deviance: 53.299 on 290 degrees of freedom
## AIC: 73.299
##
## Number of Fisher Scoring iterations: 9
Comments: Now we can see that our Regression Model Converge, let’s compute a fourth iteration of VIF to check the multicolinearity.
Test For Multicolinearity in Our Dataset using VIF - Fourth Iteration
set.seed(1)
# Load the car library
library(car)
# Create vector of VIF values
# Recompute VIF for the 9-predictor model.
vif_values_4 <- vif(Logistic_Model_After_VIF3)
# Create horizontal bar chart to display each VIF value
barplot(vif_values_4, main = "VIF Values - Fourth Iteration", horiz = FALSE, col = "steelblue", las=2)
# Add vertical line at 5
abline(h = 5, lwd = 3, lty = 2)# Call VIF Values
vif_values_4## area_mean smoothness_mean symmetry_mean
## 1.974465 2.170228 3.393934
## smoothness_se `concave points_se` symmetry_se
## 2.506280 5.034108 4.507909
## `concave points_worst` symmetry_worst fractal_dimension_worst
## 3.656527 6.429923 2.915146
Comments: We can see that our highest VIF values come from symmetry_worst, we can remove it and check if our model is now free of multicolinearity issues.
Logistic Regression with VIF removing 1 variable - Fifth Iteration (Last)
set.seed(1)
# Fit The Logistic Regression Model with only selected variables from Lasso
# Final model: 8 predictors remain after dropping symmetry_worst.
Logistic_Model_After_VIF4 <- glm(diagnosis ~ area_mean + smoothness_mean + symmetry_mean + smoothness_se + `concave points_se` + symmetry_se + `concave points_worst` + fractal_dimension_worst , family=binomial(link='logit'), data=Training_Logistic)
# Disable Scientific Notation
options(scipen=999)
# Model Summary
summary(Logistic_Model_After_VIF4)##
## Call:
## glm(formula = diagnosis ~ area_mean + smoothness_mean + symmetry_mean +
## smoothness_se + `concave points_se` + symmetry_se + `concave points_worst` +
## fractal_dimension_worst, family = binomial(link = "logit"),
## data = Training_Logistic)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6582 -0.0909 -0.0251 0.0086 4.2428
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -21.031832 5.780075 -3.639 0.000274 ***
## area_mean 0.014467 0.003604 4.014 0.0000597 ***
## smoothness_mean -6.948246 39.846117 -0.174 0.861569
## symmetry_mean 6.581404 23.217996 0.283 0.776823
## smoothness_se 483.403351 217.848490 2.219 0.026487 *
## `concave points_se` -273.861042 131.451779 -2.083 0.037219 *
## symmetry_se 61.479884 66.461799 0.925 0.354945
## `concave points_worst` 71.119350 21.538479 3.302 0.000960 ***
## fractal_dimension_worst 8.358018 35.734350 0.234 0.815068
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 393.187 on 299 degrees of freedom
## Residual deviance: 54.198 on 291 degrees of freedom
## AIC: 72.198
##
## Number of Fisher Scoring iterations: 9
Comments: We improved some significance level removing this high VIF variable.
Test For Multicollinearity in Our Dataset using VIF - Fifth Iteration (Last)
set.seed(1)
# Load the car library (provides vif())
library(car)
# Compute the VIF of every predictor in the fifth-iteration model
vif_values_5<- vif(Logistic_Model_After_VIF4)
# Vertical bar chart displaying each VIF value (las = 2 rotates the labels)
barplot(vif_values_5, main = "VIF Values - Fifth Iteration", horiz = FALSE, col = "steelblue", las=2)
# Horizontal reference line at the rule-of-thumb VIF cutoff of 5
abline(h = 5, lwd = 3, lty = 2)# Call VIF Values
vif_values_5## area_mean smoothness_mean symmetry_mean
## 2.054457 1.987622 2.246336
## smoothness_se `concave points_se` symmetry_se
## 2.286389 2.614686 2.244157
## `concave points_worst` fractal_dimension_worst
## 3.857056 2.524969
Comments: Now we can see that all our selected variables are no longer subject to multicollinearity. Let’s use this model to compute some predictions.
Predictions
Logistic Lasso Regression - Predictions and Confusion Matrix on Validation
set.seed(1)
# Predicted malignancy probabilities on the validation set using the
# final (post-VIF) logistic regression model.
Logistic_Lasso_Predictions <- predict(Logistic_Model_After_VIF4, Validation_Logistic[,c("area_mean", "smoothness_mean", "symmetry_mean", "smoothness_se", "concave points_se", "symmetry_se", "concave points_worst", "fractal_dimension_worst")], type = "response")
# Classify with an explicit 0.5 threshold. The original round() relies on
# round-half-to-even, which maps an exact 0.5 to 0; the explicit comparison
# is unambiguous while producing the same labels on these probabilities.
Logistic_Lasso_Predictions_Dummy <- ifelse(Logistic_Lasso_Predictions > 0.5, 1, 0)
# As Numeric (kept for compatibility with the rest of the pipeline)
Logistic_Lasso_Predictions_Dummy <- as.numeric(Logistic_Lasso_Predictions_Dummy)
# Side-by-side data frame to inspect the probability -> class mapping
DF_Logistic_Lasso_Predictions <- cbind(Logistic_Lasso_Predictions, Logistic_Lasso_Predictions_Dummy)
# Declare both factor levels explicitly: with as.factor(), a run in which
# every validation case is predicted into a single class would drop the
# missing level and make confusionMatrix() fail on mismatched levels.
Logistic_Lasso_Predictions_Dummy <- factor(Logistic_Lasso_Predictions_Dummy, levels = c(0, 1))
# Confusion Matrix ("1" = Malignant is the positive class)
Confusion_Matrix_Logistic_Lasso <- confusionMatrix(data = Logistic_Lasso_Predictions_Dummy, reference = Validation_Logistic$diagnosis, positive = "1")
# Create the Function for Confusion Matrix
# Draw the validation-set confusion matrix for the logistic regression.
#
# The body of this helper was a ~40-line copy of the generic
# draw_confusion_matrix(cm, titleaddon) defined at the top of this file,
# differing only in the title text. Delegate instead of duplicating, so a
# future layout fix only has to be made in one place.
#
# @param cm A caret::confusionMatrix object (2x2, Benign vs Malignant).
draw_confusion_matrix_Logistic_Lasso <- function(cm) {
  draw_confusion_matrix(cm, titleaddon = 'for Logistic Regression - Validation')
}
# Plot the Confusion Matrix
draw_confusion_matrix_Logistic_Lasso(Confusion_Matrix_Logistic_Lasso)Comments:
ROC Curve of the Logistic Regression - Validation
set.seed(1)
# Load ROCR Package
library(ROCR)
# ROC curve for the validation-set probabilities:
# prediction() pairs the fitted probabilities with the true labels, and
# performance() extracts the TPR/FPR trade-off across all thresholds.
pr <- ROCR::prediction(Logistic_Lasso_Predictions, Validation_Logistic$diagnosis)
prf <- ROCR::performance(pr, measure = "tpr", x.measure = "fpr")
plot(prf, main="ROC for Validation Set")
# 45-degree line = performance of a random classifier
abline(a = 0, b = 1) Comments:
Logistic Lasso Regression - Predictions and Confusion Matrix on Test
set.seed(1)
# Predicted malignancy probabilities on the held-out test set using the
# final (post-VIF) logistic regression model.
Logistic_Lasso_Predictions_Test <- predict(Logistic_Model_After_VIF4, Test_Logistic[,c("area_mean", "smoothness_mean", "symmetry_mean", "smoothness_se", "concave points_se", "symmetry_se", "concave points_worst", "fractal_dimension_worst")], type = "response")
# Classify with an explicit 0.5 threshold (round() maps an exact 0.5 to 0
# via round-half-to-even; the explicit comparison is unambiguous).
Logistic_Lasso_Predictions_Dummy_Test <- ifelse(Logistic_Lasso_Predictions_Test > 0.5, 1, 0)
# As Numeric (kept for compatibility with the rest of the pipeline)
Logistic_Lasso_Predictions_Dummy_Test <- as.numeric(Logistic_Lasso_Predictions_Dummy_Test)
# Side-by-side data frame to inspect the probability -> class mapping
DF_Logistic_Lasso_Predictions_Test <- cbind(Logistic_Lasso_Predictions_Test, Logistic_Lasso_Predictions_Dummy_Test)
# Declare both factor levels explicitly so confusionMatrix() still works
# even if every test case lands in a single predicted class.
Logistic_Lasso_Predictions_Dummy_Test <- factor(Logistic_Lasso_Predictions_Dummy_Test, levels = c(0, 1))
# Confusion Matrix ("1" = Malignant is the positive class)
Confusion_Matrix_Logistic_Lasso_Test <- confusionMatrix(data = Logistic_Lasso_Predictions_Dummy_Test, reference = Test_Logistic$diagnosis, positive = "1")
# Create the Function for Confusion Matrix
# Draw the test-set confusion matrix for the logistic regression.
#
# As with the validation variant, the body duplicated the generic
# draw_confusion_matrix(cm, titleaddon) from the top of this file except
# for the title text, so we delegate to it instead.
#
# @param cm A caret::confusionMatrix object (2x2, Benign vs Malignant).
draw_confusion_matrix_Logistic_Lasso_Test <- function(cm) {
  draw_confusion_matrix(cm, titleaddon = 'for Logistic Regression - Test')
}
# Plot the Confusion Matrix
draw_confusion_matrix_Logistic_Lasso_Test(Confusion_Matrix_Logistic_Lasso_Test)Comments:
Best Logistic Regression
On Validation
# Collect the best logistic model's VALIDATION results for the final model
# comparison: byClass[c(1,2)] = Sensitivity, Specificity; overall[1] = Accuracy.
# Confusion Best
Best_Logistic_Confusion <- Confusion_Matrix_Logistic_Lasso
# Predictions Best
Best_Logistic_Predictions_Dummy <- Logistic_Lasso_Predictions_Dummy
Best_Logistic_Predictions_Dummy <- factor(Best_Logistic_Predictions_Dummy)
Best_Logistic_Predictions_Probabilities <- Logistic_Lasso_Predictions
# Best Predictions as Data frame
DF_Best_Logistic_Predictions <- data.frame(Best_Logistic_Predictions_Dummy, Best_Logistic_Predictions_Probabilities) # FINAL PREDICTIONS DATAFRAME
# Best Confusion as Data frame
DF_Best_Logistic_Confusion <- data.frame(c(Best_Logistic_Confusion$byClass[c(1,2)], Best_Logistic_Confusion$overall[1]))
colnames(DF_Best_Logistic_Confusion) <- "Best Logistic Lasso Regression"
DF_Best_Logistic_Confusion <- t(DF_Best_Logistic_Confusion) # FINAL CONFUSION DATAFRAMEOn Test
# Same bookkeeping for the TEST-set results of the best logistic model.
# Confusion Best
Best_Logistic_Confusion_Test <- Confusion_Matrix_Logistic_Lasso_Test
# Predictions Best
Best_Logistic_Predictions_Dummy_Test <- Logistic_Lasso_Predictions_Dummy_Test
Best_Logistic_Predictions_Dummy_Test <- factor(Best_Logistic_Predictions_Dummy_Test)
Best_Logistic_Predictions_Probabilities_Test <- Logistic_Lasso_Predictions_Test
# Best Predictions as Data frame
DF_Best_Logistic_Predictions_Test <- data.frame(Best_Logistic_Predictions_Dummy_Test, Best_Logistic_Predictions_Probabilities_Test) # FINAL PREDICTIONS DATAFRAME
# Best Confusion as Data frame
DF_Best_Logistic_Confusion_Test <- data.frame(c(Best_Logistic_Confusion_Test$byClass[c(1,2)], Best_Logistic_Confusion_Test$overall[1]))
colnames(DF_Best_Logistic_Confusion_Test) <- "Best Logistic Lasso Regression"
DF_Best_Logistic_Confusion_Test <- t(DF_Best_Logistic_Confusion_Test) # FINAL CONFUSION DATAFRAMEClassification Tree
# Copies of the train/validation/test partitions for the tree models,
# with the non-predictive id column (column 1) removed.
#Re-name the partitions in the data
Training_M <- Training
Validation_M <- Validation
Test_M <- Test
#Take out the id column
Training_M <- data.frame(Training_M[,-c(1)])
Validation_M <- data.frame(Validation_M[,-c(1)])
Test_M<- data.frame(Test_M[, -c(1)])
# Checking if proportions are right
# (each partition's row count as a percentage of the full ORIGINAL data)
Prop_Training <- (nrow(Training_M)/nrow(ORIGINAL))*100
Prop_Validation <- (nrow(Validation_M)/nrow(ORIGINAL))*100
Prop_Test <- (nrow(Test_M)/nrow(ORIGINAL))*100
# Print Proportion
paste("The Proportions are:", round(Prop_Training,2),"% In Training,",round(Prop_Validation,2),"% In Validation, and ",round(Prop_Test,2),"% In Test")## [1] "The Proportions are: 52.72 % In Training, 27.94 % In Validation, and 19.33 % In Test"
The outcome variable is a binary factor, we model a classification tree. We first run a deep tree, with all the features included. Then proceed to reduce the size of the deeper tree through pruning.
run tree:
set.seed(1)
options(scipen=999)
# Grow a deliberately deep classification tree: minbucket = 1 and a tiny cp
# impose almost no stopping rule, so the tree overfits; it is pruned later.
tree_full <- rpart(diagnosis ~ .,
data = Training_M,
method = "class", # "class" because Y is a binary factor
minbucket = 1,
cp = 0.00001)
# Plot tree
rpart.plot(tree_full, yesno = TRUE, digits =-6)length(tree_full$frame$var[tree_full$frame$var == "<leaf>"]) # End nodes## [1] 9
relevance<-as.data.frame(tree_full$variable.importance) #we get the ranking of the variables by importance
kable(relevance, row.names = T,col.names="Variable Importance")%>% kable_paper("hover", full_width = T) #built table| Variable Importance | |
|---|---|
| perimeter_worst | 107.1041853 |
| radius_worst | 104.4460423 |
| area_worst | 103.3104820 |
| radius_mean | 96.4545551 |
| perimeter_mean | 94.4386002 |
| area_mean | 93.1159458 |
| concave.points_worst | 16.8085026 |
| compactness_worst | 7.2225217 |
| symmetry_worst | 7.2225217 |
| concave.points_mean | 6.9755085 |
| concavity_worst | 5.7780173 |
| texture_mean | 5.2906178 |
| concavity_mean | 5.1375986 |
| texture_worst | 3.9679634 |
| fractal_dimension_mean | 2.6453089 |
| fractal_dimension_worst | 2.6453089 |
| smoothness_mean | 1.9784946 |
| texture_se | 1.4970760 |
| compactness_mean | 1.0338243 |
| concave.points_se | 0.8040856 |
| smoothness_se | 0.7485380 |
| compactness_se | 0.5743468 |
printcp(tree_full, digits = 6) # print complexity value##
## Classification tree:
## rpart(formula = diagnosis ~ ., data = Training_M, method = "class",
## minbucket = 1, cp = 0.00001)
##
## Variables actually used in tree construction:
## [1] concave.points_mean concave.points_worst perimeter_worst
## [4] radius_mean radius_worst smoothness_mean
## [7] texture_mean
##
## Root node error: 109/300 = 0.363333
##
## n= 300
##
## CP nsplit rel error xerror xstd
## 1 0.83486239 0 1.00000000 1.000000 0.0764263
## 2 0.08256881 1 0.16513761 0.266055 0.0469566
## 3 0.01834862 2 0.08256881 0.174312 0.0387028
## 4 0.00917431 4 0.04587156 0.165138 0.0377375
## 5 0.00001000 8 0.00917431 0.192661 0.0405438
plotcp(tree_full, upper = "splits") # we plot the progression of complexity values#Prune the tree
# Best-pruned tree: pick the cp of the row of the complexity table with the
# smallest cross-validated error, then prune the deep tree back to it.
min_xerr<- which.min(tree_full$cptable[,"xerror"]) # select minimum cross-validation error
cp_bp <- tree_full$cptable[min_xerr,"CP"] # find the corresponding CP value, to get the "best pruned " tree
pruned_tree<- prune(tree_full, cp = cp_bp) # re-compute the tree with the selected Cp
rpart.plot(pruned_tree, yesno = TRUE, digits =-3)length(pruned_tree$frame$var[pruned_tree$frame$var == "<leaf>"]) # how many end nodes## [1] 5
The fully grown tree is quite reduced in size, still we proceed with
the pruning. The Best Pruned tree has in total 5 end nodes, obtained by
selecting the minimum xerror. We also report the table of
variables in importance.
Performance of Best Pruned Tree
# Evaluate the best-pruned tree on the validation partition.
# classification prediction over validation data
pruned_pred <- predict(pruned_tree, Validation_M, type = "class")
pruned_prob <- predict(pruned_tree, Validation_M, type = "prob") # probabilities of belonging to 1
# confusion matrix and accuracy of classification tree ("1" = Malignant)
tree_cf<- confusionMatrix(pruned_pred, Validation_M$diagnosis, positive = "1")
draw_confusion_matrix(tree_cf)Sensitivity lower than Specificity ( Malign diagnosis is minority). Accuracy amounts to 0.906, which is expected for such a simple model.
#ROC curve
# Both columns coerced factor -> character -> numeric; the first column of
# Validation_M is the diagnosis, auto-named "Validation_M...1." by data.frame().
ROC_df <- data.frame(Validation_M[,1], pruned_pred)
ROC_df[,1]<- as.numeric(as.character(ROC_df[,1]))
ROC_df$pruned_pred<- as.numeric(as.character(ROC_df$pruned_pred))
roc_score <- roc(data= ROC_df , response=Validation_M...1., pruned_pred) #AUC score## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc_score ,main ="ROC curve")Specificity is higher than Sensitivity, the model performs better in the categorisation of benign cases.
K-Nearest Neighbor
As K-nearest neighbour is based on distances, the predictors need to be standardized so that they have a mean of 0 and a variance equal to 1. Standardizing is necessary because otherwise variables with larger values would have more influence on the distance being calculated.
# partition (drop the non-predictive id column from each set)
ORIGINAL.KNN.train <- dplyr::select(Training, -c(id))
ORIGINAL.KNN.valid <- dplyr::select(Validation, -c(id))
ORIGINAL.KNN.test <- dplyr::select(Test, -c(id))
# standardize: the center/scale parameters are estimated on the TRAINING set
# only and then applied to all three partitions, avoiding data leakage
norm.value <- preProcess(ORIGINAL.KNN.train, method = c("center", "scale"))
ORIGINAL.KNN.train <- predict(norm.value, ORIGINAL.KNN.train)
ORIGINAL.KNN.valid <- predict(norm.value, ORIGINAL.KNN.valid)
ORIGINAL.KNN.test <- predict(norm.value, ORIGINAL.KNN.test)To get the best k one might iterate over severall k and choose the one which has the highest value for the metric that is being looked at. In this case we looked at either accuracy or sensitivity.
set.seed(1)
# Sweep k = 1..30 and record validation accuracy and sensitivity for each k.
df <- data.frame(k = seq(1, 30, 1), accuracy = rep(0, 30), sensitivity = rep(0, 30))
# iterating over different ks
for(i in seq_len(30)){
# nearest neighbor
KNN1 <- knn3(y = ORIGINAL.KNN.train$diagnosis, x = dplyr::select(ORIGINAL.KNN.train, -c(diagnosis)), k = i)
# predictions response
KNN1.pred.valid.resp <- predict(KNN1, dplyr::select(ORIGINAL.KNN.valid, -c(diagnosis)), type = "class")
# predictions prob
KNN1.pred.valid.prob <- predict(KNN1, dplyr::select(ORIGINAL.KNN.valid, -c(diagnosis)), type = "prob")[,2]
# Confusionmatrix: compute once per iteration instead of twice (the original
# called confusionMatrix() separately for sensitivity and accuracy).
cm_i <- confusionMatrix(KNN1.pred.valid.resp, ORIGINAL.KNN.valid$diagnosis, positive = "1")
df$sensitivity[i] <- cm_i$byClass[1]
df$accuracy[i] <- cm_i$overall[1]
}
# plot the k's
ggplot(df, aes(x=k)) +
geom_line(aes(y = sensitivity, colour = "Sensitivity")) +
geom_line(aes(y = accuracy, colour = "Accuracy")) +
labs(x = "Number of k nearest neighbours",
y = "Accuracy / Sensitivity", title = "Accuracy / Sensitivity regarding k") +
theme_minimal() +
scale_y_continuous(name = "Sensitivity / Accuracy", limits = c(0.7, 1)) +
scale_color_manual(name = "Values", values = c("Sensitivity" = "darkblue", "Accuracy" = "red")) +
xlim (1, 30)From the output we can see that the best k is either 6 or 1. For accuracy the best k is 6, while for sensitivity it is 1, with the second best being 6. Below is the code for KNN with k=6 and k=1.
set.seed(1)
# Final KNN fit with k = 6 (best k by validation accuracy in the sweep above).
# nearest neighbor
KNN1 <- knn3(y = ORIGINAL.KNN.train$diagnosis, x = dplyr::select(ORIGINAL.KNN.train, -c(diagnosis)), k = 6)
# predictions response
KNN1.pred.valid.resp <- predict(KNN1, dplyr::select(ORIGINAL.KNN.valid, -c(diagnosis)), type = "class")
# predictions prob (second column = probability of class "1" / Malignant)
KNN1.pred.valid.prob <- predict(KNN1, dplyr::select(ORIGINAL.KNN.valid, -c(diagnosis)), type = "prob")[,2]
# Confusionmatrix Validation
KNN1.conf.mat <- confusionMatrix(KNN1.pred.valid.resp, ORIGINAL.KNN.valid$diagnosis, positive = "1")
draw_confusion_matrix(KNN1.conf.mat, titleaddon = "KNN with k=6")When looking at the output we see that 6 nearest neighbors doesn’t deliver a bad model but it lacks sensitivity which is very important in the case of classifying cancer.
Here we see the model with 1 nearest neighbor.
set.seed(1)
# Comparison KNN fit with k = 1 (best k by validation sensitivity in the sweep).
# nearest neighbor
KNN2 <- knn3(y = ORIGINAL.KNN.train$diagnosis, x = dplyr::select(ORIGINAL.KNN.train, -c(diagnosis)), k = 1)
# predictions response
KNN2.pred.valid.resp <- predict(KNN2, dplyr::select(ORIGINAL.KNN.valid, -c(diagnosis)), type = "class")
# predictions prob (second column = probability of class "1" / Malignant)
KNN2.pred.valid.prob <- predict(KNN2, dplyr::select(ORIGINAL.KNN.valid, -c(diagnosis)), type = "prob")[,2]
# Confusionmatrix Validation
KNN2.conf.mat <- confusionMatrix(KNN2.pred.valid.resp, ORIGINAL.KNN.valid$diagnosis, positive = "1")
draw_confusion_matrix(KNN2.conf.mat, titleaddon = 'KNN')We see that this model with 1 nearest neighbor is overall a little better than the model with k equal to 6 and is better in sensitivity. But we will not continue with this model as it isn’t much better than the one with k equal to 6 and in KNN only one neighbor implies overfitting. As we do not want to run this risk we continue with k equal to 6.
Down below we print out again the best model.
# Collect the best KNN model's (k = 6) VALIDATION results for the final
# comparison: byClass[c(1,2)] = Sensitivity, Specificity; overall[1] = Accuracy.
# Confusion Best
Best_KNN_Confusion <- KNN1.conf.mat
# Predictions Best
Best_KNN_Predictions_Dummy <- KNN1.pred.valid.resp
Best_KNN_Predictions_Dummy <- factor(Best_KNN_Predictions_Dummy)
Best_KNN_Predictions_Probabilities <- KNN1.pred.valid.prob
# Best Predictions as Data frame
DF_Best_KNN_Predictions <- data.frame(Best_KNN_Predictions_Dummy, Best_KNN_Predictions_Probabilities) # FINAL PREDICTIONS DATAFRAME
# Best Confusion as Data frame
DF_Best_KNN_Confusion <- data.frame(c(Best_KNN_Confusion$byClass[c(1,2)], Best_KNN_Confusion$overall[1]))
colnames(DF_Best_KNN_Confusion) <- "Best KNN"
DF_Best_KNN_Confusion <- t(DF_Best_KNN_Confusion) # FINAL CONFUSION DATAFRAMEset.seed(1)
# Score the k = 6 model on the held-out TEST set and store the same summaries.
# predictions response
KNN1.pred.test.resp <- predict(KNN1, dplyr::select(ORIGINAL.KNN.test, -c(diagnosis)), type = "class")
# predictions prob
KNN1.pred.test.prob <- predict(KNN1, dplyr::select(ORIGINAL.KNN.test, -c(diagnosis)), type = "prob")[,2]
# Confusionmatrix Validation
KNN1.conf.mat.test <- confusionMatrix(KNN1.pred.test.resp, ORIGINAL.KNN.test$diagnosis, positive = "1")
draw_confusion_matrix(KNN1.conf.mat.test, titleaddon = "KNN for test data with k=6")# Confusion Best
Best_KNN_Confusion_test <- KNN1.conf.mat.test
# Predictions Best
Best_KNN_Predictions_Dummy_test <- KNN1.pred.test.resp
Best_KNN_Predictions_Dummy_test <- factor(Best_KNN_Predictions_Dummy_test)
Best_KNN_Predictions_Probabilities_test <- KNN1.pred.test.prob
# Best Predictions as Data frame
DF_Best_KNN_Predictions_test <- data.frame(Best_KNN_Predictions_Dummy_test, Best_KNN_Predictions_Probabilities_test) # FINAL PREDICTIONS DATAFRAME
# Best Confusion as Data frame
DF_Best_KNN_Confusion_test <- data.frame(c(Best_KNN_Confusion_test$byClass[c(1,2)], Best_KNN_Confusion_test$overall[1]))
colnames(DF_Best_KNN_Confusion_test) <- "Best KNN test"
DF_Best_KNN_Confusion_test <- t(DF_Best_KNN_Confusion_test) # FINAL CONFUSION DATAFRAMENeural Networks
set.seed(1)
# Prepare the partitions for the neural network: drop the id column and
# center/scale using parameters estimated on the training set only.
# Duplicate the Training and Validation Set
Training_NN <- Training
Validation_NN <- Validation
Test_NN <- Test
# Make Sure to be as Dataframe
Training_NN <- data.frame(Training_NN)
Validation_NN <- data.frame(Validation_NN)
Test_NN <- data.frame(Test_NN)
# Remove the "ID" Variable
Training_NN <- Training_NN[,-1]
Validation_NN <- Validation_NN[,-1]
Test_NN <- Test_NN[,-1]
# Preprocess Data
Norm_NN <- preProcess(Training_NN, method = c("center", "scale"))
Training_NN_Preprocess <- predict(Norm_NN, Training_NN)
Validation_NN_Preprocess <- predict(Norm_NN, Validation_NN)
Test_NN_Preprocess <- predict(Norm_NN, Test_NN)Neural Network Best Model
Validation
# Collect the best neural network's VALIDATION results for the final comparison.
# NOTE(review): Confusion_Matrix_Neural_2 and Predictions_NN2_* are produced
# by the neural-network fitting chunk, which is not part of this section --
# confirm that chunk runs before this block.
# Confusion Best
Best_Neural_Network_Confusion <- Confusion_Matrix_Neural_2
# Predictions Best
Best_Neural_Network_Predictions_Dummy <- Predictions_NN2_Dummy
Best_Neural_Network_Predictions_Dummy <- factor(Best_Neural_Network_Predictions_Dummy)
Best_Neural_Network_Predictions_Probabilities <- Predictions_NN2_Probabilities
# Best Predictions as Data frame
DF_Best_Neural_Network_Predictions <- data.frame(Best_Neural_Network_Predictions_Dummy, Best_Neural_Network_Predictions_Probabilities) # FINAL PREDICTIONS DATAFRAME
# Best Confusion as Data frame
DF_Best_Neural_Confusion <- data.frame(c(Best_Neural_Network_Confusion$byClass[c(1,2)], Best_Neural_Network_Confusion$overall[1]))
colnames(DF_Best_Neural_Confusion) <- "Best Neural Network"
DF_Best_Neural_Confusion <- t(DF_Best_Neural_Confusion) # FINAL CONFUSION DATAFRAMETest
# Same bookkeeping for the TEST-set results of the best neural network.
# Confusion Best
Best_Neural_Network_Confusion_Test <- Confusion_Matrix_Neural_2_Test
# Predictions Best
Best_Neural_Network_Predictions_Dummy_Test <- Predictions_NN2_Dummy_Test
Best_Neural_Network_Predictions_Dummy_Test <- factor(Best_Neural_Network_Predictions_Dummy_Test)
Best_Neural_Network_Predictions_Probabilities_Test <- Predictions_NN2_Probabilities_Test
# Best Predictions as Data frame
DF_Best_Neural_Network_Predictions_Test <- data.frame(Best_Neural_Network_Predictions_Dummy_Test, Best_Neural_Network_Predictions_Probabilities_Test) # FINAL PREDICTIONS DATAFRAME
# Best Confusion as Data frame
DF_Best_Neural_Confusion_Test <- data.frame(c(Best_Neural_Network_Confusion_Test$byClass[c(1,2)], Best_Neural_Network_Confusion_Test$overall[1]))
colnames(DF_Best_Neural_Confusion_Test) <- "Best Neural Network"
DF_Best_Neural_Confusion_Test <- t(DF_Best_Neural_Confusion_Test) # FINAL CONFUSION DATAFRAMEDiscriminant Analysis
To run discriminant analysis the data needs to be centered and scaled, as otherwise features with larger values might again have a bigger influence.
set.seed(1)
# partition (drop the non-predictive id column from each set)
ORIGINAL.DA.train <- dplyr::select(Training, -c(id))
ORIGINAL.DA.valid <- dplyr::select(Validation, -c(id))
ORIGINAL.DA.test <- dplyr::select(Test, -c(id))
# standardize: center/scale parameters fitted on the training set only,
# then applied to all three partitions (no leakage from valid/test)
norm.value <- preProcess(ORIGINAL.DA.train, method = c("center", "scale"))
ORIGINAL.DA.train <- predict(norm.value, ORIGINAL.DA.train)
ORIGINAL.DA.valid <- predict(norm.value, ORIGINAL.DA.valid)
ORIGINAL.DA.test <- predict(norm.value, ORIGINAL.DA.test)First we run a linear discriminant analysis.
set.seed(1)
# Linear discriminant analysis on all predictors.
# Fit the model
DA1 <- lda(diagnosis~., data = ORIGINAL.DA.train)
# Make predictions
predictions <- predict(DA1, ORIGINAL.DA.valid)
# predictions prob (posterior column 2 = probability of class "1" / Malignant)
DA1.pred.valid.prob <- predictions$posterior[,2]
# predictions response
DA1.pred.valid.resp <- factor(predictions$class)
# confusion matrix
DA1.conf.mat <- confusionMatrix(DA1.pred.valid.resp, ORIGINAL.DA.valid$diagnosis, positive = "1")
draw_confusion_matrix(DA1.conf.mat, titleaddon = 'Discriminant Analysis')# Evaluating LDA
# not run because it plots a lot of graphs
# partimat(diagnosis~., data = data.frame(ORIGINAL.DA.train), method="lda", mar=c(0.5, 0.5, 0.5, 0.5))Not a bad model but again it lacks sensitivity. As the model is quite big we can try a variable selection.
set.seed(1)
modelstepL <- stepclass(diagnosis ~ ., "lda", direction = "backward", data = data.frame(ORIGINAL.DA.train))## `stepwise classification', using 10-fold cross-validated correctness rate of method lda'.
## 300 observations of 30 variables in 2 classes; direction: backward
## stop criterion: improvement less than 5%.
## correctness rate: 0.94667; starting variables (30): radius_mean, texture_mean, perimeter_mean, area_mean, smoothness_mean, compactness_mean, concavity_mean, concave.points_mean, symmetry_mean, fractal_dimension_mean, radius_se, texture_se, perimeter_se, area_se, smoothness_se, compactness_se, concavity_se, concave.points_se, symmetry_se, fractal_dimension_se, radius_worst, texture_worst, perimeter_worst, area_worst, smoothness_worst, compactness_worst, concavity_worst, concave.points_worst, symmetry_worst, fractal_dimension_worst
## correctness rate: 0.96333; out: "radius_worst"; variables (29): radius_mean, texture_mean, perimeter_mean, area_mean, smoothness_mean, compactness_mean, concavity_mean, concave.points_mean, symmetry_mean, fractal_dimension_mean, radius_se, texture_se, perimeter_se, area_se, smoothness_se, compactness_se, concavity_se, concave.points_se, symmetry_se, fractal_dimension_se, texture_worst, perimeter_worst, area_worst, smoothness_worst, compactness_worst, concavity_worst, concave.points_worst, symmetry_worst, fractal_dimension_worst
##
## hr.elapsed min.elapsed sec.elapsed
## 0.00 0.00 8.17
# LDA refit on the stepclass() backward-selection result: all original
# predictors except radius_worst, which the selection dropped.
DA1.sel <- lda(diagnosis ~ radius_mean + texture_mean + perimeter_mean + area_mean +
smoothness_mean + compactness_mean + concavity_mean + `concave points_mean` +
symmetry_mean + fractal_dimension_mean + radius_se + texture_se +
perimeter_se + area_se + smoothness_se + compactness_se +
concavity_se + `concave points_se` + symmetry_se + fractal_dimension_se +
texture_worst + perimeter_worst + area_worst + smoothness_worst +
compactness_worst + concavity_worst + `concave points_worst` +
symmetry_worst + fractal_dimension_worst, data = ORIGINAL.DA.train)
# Make predictions
predictions.sel <- predict(DA1.sel, ORIGINAL.DA.valid)
# predictions prob (posterior column 2 = probability of class "1" / Malignant)
DA1.sel.pred.valid.prob <- predictions.sel$posterior[,2]
# predictions response
DA1.sel.pred.valid.resp <- factor(predictions.sel$class)
# confusion matrix
DA1.sel.conf.mat <- confusionMatrix(DA1.sel.pred.valid.resp, ORIGINAL.DA.valid$diagnosis, positive = "1")
draw_confusion_matrix(DA1.sel.conf.mat, titleaddon = 'Discriminant Analysis')We see that the model only drops one variable and the predictive power of the model doesn’t change much as instead of 1 false positive and 7 false negatives there are now 0 false positives and 8 false negatives.
As we have seen in the data analysis, there is some correlation in the data, which is why we can try to run a quadratic discriminant analysis.
set.seed(1)
# Quadratic discriminant analysis: allows class-specific covariance matrices,
# motivated by the correlation observed in the exploratory analysis.
# Fit the model
DA2 <- qda(diagnosis ~., data = ORIGINAL.DA.train)
# Make predictions
predictions <- predict(DA2, ORIGINAL.DA.valid)
# predictions prob (posterior column 2 = probability of class "1" / Malignant)
DA2.pred.valid.prob <- predictions$posterior[,2]
# predictions response
DA2.pred.valid.resp <- factor(predictions$class)
# confusion matrix
DA2.conf.mat <- confusionMatrix(DA2.pred.valid.resp, ORIGINAL.DA.valid$diagnosis, positive = "1")
draw_confusion_matrix(DA2.conf.mat, titleaddon = 'Quadratic Discriminant Analysis')Overall this model is worse than the LDA, but in terms of sensitivity it is better, as there are only 4 false negatives.
set.seed(1)
modelstepL <- stepclass(diagnosis ~ ., "qda", direction = "backward", data = data.frame(ORIGINAL.DA.train))## `stepwise classification', using 10-fold cross-validated correctness rate of method qda'.
## 300 observations of 30 variables in 2 classes; direction: backward
## stop criterion: improvement less than 5%.
## correctness rate: 0.94667; starting variables (30): radius_mean, texture_mean, perimeter_mean, area_mean, smoothness_mean, compactness_mean, concavity_mean, concave.points_mean, symmetry_mean, fractal_dimension_mean, radius_se, texture_se, perimeter_se, area_se, smoothness_se, compactness_se, concavity_se, concave.points_se, symmetry_se, fractal_dimension_se, radius_worst, texture_worst, perimeter_worst, area_worst, smoothness_worst, compactness_worst, concavity_worst, concave.points_worst, symmetry_worst, fractal_dimension_worst
## correctness rate: 0.95333; out: "area_mean"; variables (29): radius_mean, texture_mean, perimeter_mean, smoothness_mean, compactness_mean, concavity_mean, concave.points_mean, symmetry_mean, fractal_dimension_mean, radius_se, texture_se, perimeter_se, area_se, smoothness_se, compactness_se, concavity_se, concave.points_se, symmetry_se, fractal_dimension_se, radius_worst, texture_worst, perimeter_worst, area_worst, smoothness_worst, compactness_worst, concavity_worst, concave.points_worst, symmetry_worst, fractal_dimension_worst
## correctness rate: 0.96333; out: "concavity_worst"; variables (28): radius_mean, texture_mean, perimeter_mean, smoothness_mean, compactness_mean, concavity_mean, concave.points_mean, symmetry_mean, fractal_dimension_mean, radius_se, texture_se, perimeter_se, area_se, smoothness_se, compactness_se, concavity_se, concave.points_se, symmetry_se, fractal_dimension_se, radius_worst, texture_worst, perimeter_worst, area_worst, smoothness_worst, compactness_worst, concave.points_worst, symmetry_worst, fractal_dimension_worst
## correctness rate: 0.96667; out: "perimeter_worst"; variables (27): radius_mean, texture_mean, perimeter_mean, smoothness_mean, compactness_mean, concavity_mean, concave.points_mean, symmetry_mean, fractal_dimension_mean, radius_se, texture_se, perimeter_se, area_se, smoothness_se, compactness_se, concavity_se, concave.points_se, symmetry_se, fractal_dimension_se, radius_worst, texture_worst, area_worst, smoothness_worst, compactness_worst, concave.points_worst, symmetry_worst, fractal_dimension_worst
## correctness rate: 0.97; out: "perimeter_mean"; variables (26): radius_mean, texture_mean, smoothness_mean, compactness_mean, concavity_mean, concave.points_mean, symmetry_mean, fractal_dimension_mean, radius_se, texture_se, perimeter_se, area_se, smoothness_se, compactness_se, concavity_se, concave.points_se, symmetry_se, fractal_dimension_se, radius_worst, texture_worst, area_worst, smoothness_worst, compactness_worst, concave.points_worst, symmetry_worst, fractal_dimension_worst
## correctness rate: 0.97667; out: "smoothness_mean"; variables (25): radius_mean, texture_mean, compactness_mean, concavity_mean, concave.points_mean, symmetry_mean, fractal_dimension_mean, radius_se, texture_se, perimeter_se, area_se, smoothness_se, compactness_se, concavity_se, concave.points_se, symmetry_se, fractal_dimension_se, radius_worst, texture_worst, area_worst, smoothness_worst, compactness_worst, concave.points_worst, symmetry_worst, fractal_dimension_worst
## correctness rate: 0.98; out: "smoothness_se"; variables (24): radius_mean, texture_mean, compactness_mean, concavity_mean, concave.points_mean, symmetry_mean, fractal_dimension_mean, radius_se, texture_se, perimeter_se, area_se, compactness_se, concavity_se, concave.points_se, symmetry_se, fractal_dimension_se, radius_worst, texture_worst, area_worst, smoothness_worst, compactness_worst, concave.points_worst, symmetry_worst, fractal_dimension_worst
##
## hr.elapsed min.elapsed sec.elapsed
## 0.00 0.00 17.47
# QDA refit on the stepclass() backward-selection result (six predictors
# dropped: area_mean, concavity_worst, perimeter_worst, perimeter_mean,
# smoothness_mean, smoothness_se).
DA2.sel <- qda(diagnosis ~ radius_mean + texture_mean + compactness_mean + concavity_mean +
`concave points_mean` + symmetry_mean + fractal_dimension_mean +
radius_se + texture_se + perimeter_se + area_se + compactness_se +
concavity_se + `concave points_se` + symmetry_se + fractal_dimension_se +
radius_worst + texture_worst + area_worst + smoothness_worst +
compactness_worst + `concave points_worst` + symmetry_worst +
fractal_dimension_worst, data = ORIGINAL.DA.train)
# Make predictions
predictions <- predict(DA2.sel, ORIGINAL.DA.valid)
# predictions prob (posterior column 2 = probability of class "1" / Malignant)
DA2.sel.pred.valid.prob <- predictions$posterior[,2]
# predictions response
DA2.sel.pred.valid.resp <- factor(predictions$class)
# confusion matrix
DA2.sel.conf.mat <- confusionMatrix(DA2.sel.pred.valid.resp, ORIGINAL.DA.valid$diagnosis, positive = "1")
draw_confusion_matrix(DA2.sel.conf.mat, titleaddon = 'Quadratic Discriminant Analysis')This time around 6 variables were dropped but we see that the model is not better than the qda with all predictors.
There are several more discriminant analysis methods that can be applied. Below we tried Mixture Discriminant Analysis (MDA), which often outperforms QDA and LDA because its assumptions about the class distributions are looser than those of LDA and QDA.
# --- Mixture Discriminant Analysis (MDA, package mda) ---
set.seed(1)
# Fit the model
DA4 <- mda(diagnosis~., data = ORIGINAL.DA.train)
# predictions prob: posterior probability of the second class column
DA4.pred.valid.prob <- predict(DA4, ORIGINAL.DA.valid, type = "posterior")[,2]
# predictions response at the default 0.5 cutoff
DA4.pred.valid.resp <- factor(ifelse(DA4.pred.valid.prob > 0.5, 1, 0))
# confusion matrix
DA4.conf.mat <- confusionMatrix(DA4.pred.valid.resp, ORIGINAL.DA.valid$diagnosis, positive = "1")
draw_confusion_matrix(DA4.conf.mat, 'Mixture discriminant analysis')We see that this method in general works better than the lda and qda. In this case we don’t proceed with this model as it isn’t better in sensitivity which is what we want when predicting cancer.
Down below we run a flexible discriminant analysis (fda) which is an extension of lda using non-linear combinations of predictors (splines)
# --- Flexible Discriminant Analysis (FDA, package mda) ---
set.seed(1)
# Fit the model
DA5 <- fda(diagnosis~., data = ORIGINAL.DA.train)
# predictions prob
DA5.pred.valid.prob <- predict(DA5, ORIGINAL.DA.valid, type = "posterior")[,2]
# predictions response at the default 0.5 cutoff
DA5.pred.valid.resp <- factor(ifelse(DA5.pred.valid.prob > 0.5, 1, 0))
# confusion matrix
DA5.conf.mat <- confusionMatrix(DA5.pred.valid.resp, ORIGINAL.DA.valid$diagnosis, positive = "1")
draw_confusion_matrix(DA5.conf.mat, titleaddon = 'Flexible discriminant analysis')We see that the model has a good accuracy but isn’t good in sensitivity. Therefore we don’t use it further.
Lastly we fit a regularized discriminant analysis (RDA) which is a trade off between qda and lda.
# --- Regularized Discriminant Analysis (RDA, klaR::rda) ---
# NOTE(review): this reuses the name DA5 and silently overwrites the FDA
# model fitted just above; a distinct name (e.g. DA6) would be clearer.
set.seed(1)
# Fit the model
DA5 <- rda(diagnosis~., data = data.frame(ORIGINAL.DA.train))
# predictions prob (posterior probability of the second class column)
DA5.pred.valid.prob <- predict(DA5, data.frame(ORIGINAL.DA.valid))$posterior[,2]
# predictions response at the default 0.5 cutoff
DA5.pred.valid.resp <- factor(ifelse(DA5.pred.valid.prob > 0.5, 1, 0))
# confusion matrix
DA5.conf.mat <- confusionMatrix(DA5.pred.valid.resp, ORIGINAL.DA.valid$diagnosis, positive = "1")
draw_confusion_matrix(DA5.conf.mat, titleaddon = 'Regularized discriminant analysis')This model isn’t better than any other model therefore we don’t use it further.
As we have seen, the highest sensitivity with acceptable accuracy was achieved by QDA, so we chose this model as the best one.
# Keep the chosen best discriminant model (QDA on all predictors, DA2)
# and package its validation results for the ensemble section below.
# Confusion Best
Best_DA_Confusion <- DA2.conf.mat
# Predictions Best
Best_DA_Predictions_Dummy <- DA2.pred.valid.resp
Best_DA_Predictions_Dummy <- factor(Best_DA_Predictions_Dummy)
Best_DA_Predictions_Probabilities <- DA2.pred.valid.prob
# Best Predictions as Data frame
DF_Best_DA_Predictions <- data.frame(Best_DA_Predictions_Dummy, Best_DA_Predictions_Probabilities) # FINAL PREDICTIONS DATAFRAME
# Best Confusion as Data frame (Sensitivity, Specificity, Accuracy)
DF_Best_DA_Confusion <- data.frame(c(Best_DA_Confusion$byClass[c(1,2)], Best_DA_Confusion$overall[1]))
colnames(DF_Best_DA_Confusion) <- "Best DA"
# Fix: the export had fused `set.seed(1)` onto the end of the trailing
# comment below, so it was never executed; restored as a statement.
DF_Best_DA_Confusion <- t(DF_Best_DA_Confusion) # FINAL CONFUSION DATAFRAME
set.seed(1)
# Evaluate the chosen QDA model (DA2, fitted earlier) on the test set.
# Make predictions
predictions <- predict(DA2, ORIGINAL.DA.test)
# predictions prob (posterior probability of the second class column)
DA2.pred.test.prob <- predictions$posterior[,2]
# predictions response
DA2.pred.test.resp <- factor(predictions$class)
# confusion matrix
DA2.conf.mat.test <- confusionMatrix(DA2.pred.test.resp, ORIGINAL.DA.test$diagnosis, positive = "1")
draw_confusion_matrix(DA2.conf.mat.test, titleaddon = 'Quadratic Discriminant Analysis')
# Confusion Best
Best_DA_Confusion_test <- DA2.conf.mat.test
# Predictions Best
Best_DA_Predictions_Dummy_test <- DA2.pred.test.resp
Best_DA_Predictions_Dummy_test <- factor(Best_DA_Predictions_Dummy_test)
Best_DA_Predictions_Probabilities_test <- DA2.pred.test.prob
# Best Predictions as Data frame
DF_Best_DA_Predictions_test <- data.frame(Best_DA_Predictions_Dummy_test, Best_DA_Predictions_Probabilities_test) # FINAL PREDICTIONS DATAFRAME
# Best Confusion as Data frame
DF_Best_DA_Confusion_test <- data.frame(c(Best_DA_Confusion_test$byClass[c(1,2)], Best_DA_Confusion_test$overall[1]))
colnames(DF_Best_DA_Confusion_test) <- "Best DA test"
DF_Best_DA_Confusion_test <- t(DF_Best_DA_Confusion_test) # FINAL CONFUSION DATAFRAME
Ensemble Methods
Bagging
# --- Bagging (bootstrap-aggregated classification trees, adabag) ---
set.seed(1)
# NOTE(review): the object name `bagging` shadows adabag::bagging after
# this assignment (the function stays reachable via the package namespace);
# a distinct name would be safer. Same pattern for `boosting` below.
bagging<- bagging(diagnosis ~ ., data =Training_M)
bag_pred<- predict(bagging, Validation_M, type="class")
bag_cf <- confusionMatrix(as.factor(bag_pred$class), Validation_M$diagnosis, positive = "1")
draw_confusion_matrix(bag_cf)Boosting
# --- Boosting (adaptive boosting of trees, adabag) ---
set.seed(1)
boosting <- boosting(diagnosis ~ ., data = Training_M)
boost_pred<- predict(boosting, Validation_M, type="class")
boost_cf <- confusionMatrix(as.factor(boost_pred$class), Validation_M$diagnosis, positive = "1")
draw_confusion_matrix(boost_cf)Random Forests
# --- Random Forest (randomForest, mtry = 4, with variable importance) ---
set.seed(1)
rand_f <- randomForest(diagnosis ~ ., data = Training_M, mtry=4, importance = T)
# Fix: in the export the comment "...too" had swallowed the rf_pred
# assignment ("toorf_pred"), so the prediction was never made; restored
# the comment and the statement as separate lines.
# we print out the variable importance plot too
varImpPlot(rand_f, type=1,cex = 0.7)
rf_pred<- predict(rand_f, Validation_M, type="class")
rf_cf <- confusionMatrix(as.factor(rf_pred), Validation_M$diagnosis, positive = "1")
draw_confusion_matrix(rf_cf)
The model performing best is the Boosted Trees model. They all provide improved classification capacity than the original Best Pruned Tree.
ROC curves
# ROC curves for the three tree ensembles.
# NOTE(review): roc() is fed the hard 0/1 class predictions rather than
# predicted probabilities, so each "curve" effectively has a single
# operating point; using type="prob" predictions would give proper ROC
# curves and more informative AUCs.
# For the boosting
response_boost <- data.frame(Validation_M[,1], boost_pred$class)
response_boost$Validation_M...1.<- as.numeric(as.character(response_boost[,1]))
response_boost$boost_pred.class<- as.numeric(as.character(response_boost[,2]))
roc_score_boost <- roc(data= response_boost , response=Validation_M...1., boost_pred.class) #AUC score## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# For bagging
response_bag <- data.frame(Validation_M[,1], bag_pred$class)
response_bag$Validation_M...1.<- as.numeric(as.character(response_bag[,1]))
response_bag$bag_pred.class<- as.numeric(as.character(response_bag[,2]))
roc_score_bag <- roc(data= response_bag , response=Validation_M...1., bag_pred.class) #AUC score## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# For Random Forests
response_rf <- data.frame(Validation_M[,1], rf_pred)
response_rf$Validation_M...1.<- as.numeric(as.character(response_rf[,1]))
response_rf$rf_pred<- as.numeric(as.character(response_rf[,2]))
roc_score_rf <- roc(data= response_rf , response=Validation_M...1., rf_pred) #AUC score## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Plot ROC curves side by side (1 row x 3 columns)
par(mfrow=c(1,3))
plot(roc_score_boost ,main ="ROC curve for Boosting", cex= 1)
plot(roc_score_bag ,main ="ROC curve for Bagging", cex=1)
plot(roc_score_rf ,main ="ROC curve for Random Forests", cex=1)Best Model - Boosting Trees
# Package the boosted-tree results (best tree-based model) for the
# ensemble section: class predictions, class-1 probabilities, and a
# one-row Sensitivity/Specificity/Accuracy summary.
# Best tree model is the boosted tree
# Best tree confusion matrix
best_boosted_confusion <- boost_cf
#predictions for best confusion matrix
best_boosted_pred <- boost_pred
# Probabilities of predictions
best_boosted_prob<- predict(boosting, Validation_M, type="prob")
# Prediction and probabilities data-frame (column 2 of $prob = class "1")
DF_best_boosted_pred <- data.frame(best_boosted_pred$class,best_boosted_prob$prob[,2])
#table of Confusion matrix values
DF_best_boosted_confusion <- data.frame(c(best_boosted_confusion$byClass[c(1,2)], best_boosted_confusion$overall[1]))
colnames(DF_best_boosted_confusion) <- "Boosted Tree Model"
DF_best_boosted_confusion<- t(DF_best_boosted_confusion)Best Model - Boosting Tree on Test
# Boosted Trees prediction on Test set for Average Ensemble
Boosted_Pred_Test<- predict(boosting, Test_M, type="class")
Boosted_Prob_Test<- predict(boosting, Test_M, type="prob")
# Confusion Matrix for Class predictions on Test set:
Confusion_Boosted_Test<- confusionMatrix(as.factor(Boosted_Pred_Test$class), Test_M$diagnosis, positive = "1")
draw_confusion_matrix(Confusion_Boosted_Test)# Prediction and probabilities data-frame
DF_Test_boosted_pred <- data.frame(Boosted_Pred_Test$class,Boosted_Prob_Test$prob[,2])
#table of Confusion matrix values
DF_Boosted_Confusion_Test <- data.frame(c(Confusion_Boosted_Test$byClass[c(1,2)], Confusion_Boosted_Test$overall[1]))
colnames(DF_Boosted_Confusion_Test) <- "Boosted Tree Model on Test"
DF_Boosted_Confusion_Test<- t(DF_Boosted_Confusion_Test)We predict the performance of Boosted Trees on the test set so to then recompute the Average Ensemble
Best Models
Best 5 Models - On Validation
# Gather the five best models' validation outputs. Each DF has column 1 =
# 0/1 class prediction and column 2 = class-1 probability; both are
# coerced to numeric so they can be combined by majority vote and
# probability averaging below. (The bare DF names echo the tables in the
# rendered report.)
# Best Logistic Regression
DF_Best_Logistic_Predictions
DF_Best_Logistic_Confusion
# As Numeric (factor -> character -> numeric to keep the 0/1 coding)
DF_Best_Logistic_Predictions[,1] <- as.numeric(as.character(DF_Best_Logistic_Predictions[,1]))
DF_Best_Logistic_Predictions[,2] <- as.numeric(DF_Best_Logistic_Predictions[,2])
# Best K-Nearest Neighbor
DF_Best_KNN_Predictions
DF_Best_KNN_Confusion
# As Numeric
DF_Best_KNN_Predictions[,1] <- as.numeric(as.character(DF_Best_KNN_Predictions[,1]))
DF_Best_KNN_Predictions[,2] <- as.numeric(DF_Best_KNN_Predictions[,2])
# Best Boosting Trees
DF_best_boosted_pred
DF_best_boosted_confusion
# As Numeric
DF_best_boosted_pred[,1] <- as.numeric(as.character(DF_best_boosted_pred[,1]))
DF_best_boosted_pred[,2] <- as.numeric(DF_best_boosted_pred[,2])
# Discriminant Analysis
DF_Best_DA_Predictions
DF_Best_DA_Confusion
# As Numeric
DF_Best_DA_Predictions[,1] <- as.numeric(as.character(DF_Best_DA_Predictions[,1]))
DF_Best_DA_Predictions[,2] <- as.numeric(DF_Best_DA_Predictions[,2])
# Best Neural Network
DF_Best_Neural_Network_Predictions
DF_Best_Neural_Confusion
# As Numeric
DF_Best_Neural_Network_Predictions[,1] <- as.numeric(as.character(DF_Best_Neural_Network_Predictions[,1]))
DF_Best_Neural_Network_Predictions[,2] <- as.numeric(DF_Best_Neural_Network_Predictions[,2])Majority Vote
# Majority vote over 0/1 predictions: returns 1 for each row where more
# than half of the models vote 1, else 0.
majority_vote <- function(votes) {
  as.numeric(rowMeans(votes) > 0.5)
}
# One column of 0/1 class votes per model, one row per validation case.
Majority_DF <- data.frame(DF_Best_Logistic_Predictions[,1], DF_Best_KNN_Predictions[,1], DF_best_boosted_pred[,1], DF_Best_DA_Predictions[,1], DF_Best_Neural_Network_Predictions[,1])
# Vectorized replacement of the original per-row loop (identical result:
# mean of the five votes > 0.5 means at least 3 of 5 models voted 1).
Majority_DF[, 6] <- majority_vote(Majority_DF[, 1:5])
colnames(Majority_DF) <- c("Logistic Regression","K-Nearest Neighbor", "Boosted Trees","Discriminant Analysis", "Neural Network","Majority Vote")
Majority_DF$'Validation Actual' <- as.numeric(as.character(Validation$diagnosis))
# (caption typo "Valildation" fixed)
DT::datatable(Majority_DF, caption = "Best 5 Models on Validation - Majority of Votes")
Average of Models Probabilities
# One column of class-1 probabilities per model, one row per validation case.
Average_DF <- data.frame(DF_Best_Logistic_Predictions[,2], DF_Best_KNN_Predictions[,2], DF_best_boosted_pred[,2], DF_Best_DA_Predictions[,2], DF_Best_Neural_Network_Predictions[,2])
# Ensemble probability: row-wise mean of the five model probabilities
# (vectorized replacement of the original per-row loop; identical result).
Average_DF[, 6] <- rowMeans(Average_DF[, 1:5])
colnames(Average_DF) <- c("Logistic Regression","K-Nearest Neighbor", "Boosted Trees","Discriminant Analysis", "Neural Network","Average")
Average_DF$'Average Cutoff 0.5' <- ifelse(Average_DF$`Average`>0.5,1,0)
Average_DF$'Validation Actual' <- as.numeric(as.character(Validation$diagnosis))
DT::datatable(round(Average_DF,4), caption = "Best 5 Models on Validation - Average of Probabilities")
Confusion Matrix of Majority and Average
# Majority vote Confusion Matrix: column 6 = majority-vote class,
# column 7 = actual validation label; positive class = "1" (malignant).
Majority_factor <- data.frame(as.factor(Majority_DF[,6]), as.factor(Majority_DF[,7]))
Majority_confusion <- confusionMatrix(Majority_factor[,1], Majority_factor[,2], positive = "1")
draw_confusion_matrix_Majority_Validation <- function(cm) {
  ## Render a confusion-matrix object as a two-panel base-graphics figure:
  ## the 2x2 matrix on top (2/3 of the device), a statistics strip below.
  layout(matrix(c(1,1,2)))
  par(mar=c(2,2,2,2))

  # Top panel: empty canvas in fixed user coordinates.
  plot(c(100, 345), c(300, 450), type = "n", xlab="", ylab="", xaxt='n', yaxt='n')
  title('CONFUSION MATRIX for Majority Vote - Validation', cex.main=2)

  # Quadrant cells (solid vs. translucent fills of the same green).
  rect(150, 430, 240, 370, col='#1c6155')
  rect(250, 430, 340, 370, col='#1c615570')
  rect(150, 305, 240, 365, col='#1c615570')
  rect(250, 305, 340, 365, col='#1c6155')

  # Row/column headers.
  text(195, 435, 'Benign', cex=1.2)
  text(295, 435, 'Malignant', cex=1.2)
  text(125, 370, 'Predicted', cex=1.3, srt=90, font=2)
  text(245, 450, 'Actual', cex=1.3, font=2)
  text(140, 400, 'Benign', cex=1.2, srt=90)
  text(140, 335, 'Malignant', cex=1.2, srt=90)

  # Cell counts, in the column-major order given by as.numeric(cm$table).
  counts <- as.numeric(cm$table)
  cell_x <- c(195, 195, 295, 295)
  cell_y <- c(400, 335, 400, 335)
  for (k in seq_along(counts)) {
    text(cell_x[k], cell_y[k], counts[k], cex=1.6, font=2, col='white')
  }

  # Bottom panel: byClass slots 1, 2, 5, 6, 7 (in caret: Sensitivity,
  # Specificity, Precision, Recall, F1 -- confirm), plus overall[1:2].
  plot(c(100, 0), c(100, 0), type = "n", xlab="", ylab="", main = "DETAILS", xaxt='n', yaxt='n')
  stat_x <- c(10, 30, 50, 70, 90)
  stat_idx <- c(1, 2, 5, 6, 7)
  for (k in seq_along(stat_idx)) {
    text(stat_x[k], 85, names(cm$byClass[stat_idx[k]]), cex=1.2, font=2)
    text(stat_x[k], 70, round(as.numeric(cm$byClass[stat_idx[k]]), 3), cex=1.2)
  }
  over_x <- c(30, 70)
  for (k in 1:2) {
    text(over_x[k], 35, names(cm$overall[k]), cex=1.5, font=2)
    text(over_x[k], 20, round(as.numeric(cm$overall[k]), 3), cex=1.4)
  }
}
draw_confusion_matrix_Majority_Validation(Majority_confusion)# Average Confusion Matrix
# Average-probability ensemble: column 7 = 0.5-cutoff class,
# column 8 = actual validation label; positive class = "1".
Average_factor <- data.frame(as.factor(Average_DF[,7]), as.factor(Average_DF[,8]))
Average_confusion <- confusionMatrix(Average_factor[,1], Average_factor[,2], positive = "1")
draw_confusion_matrix_Average_Validation <- function(cm) {
  ## Two-panel confusion-matrix figure for the average-probability
  ## ensemble on the validation set (2x2 matrix on top, stats below).
  layout(matrix(c(1,1,2)))
  par(mar=c(2,2,2,2))

  # Top panel: empty canvas in fixed user coordinates.
  plot(c(100, 345), c(300, 450), type = "n", xlab="", ylab="", xaxt='n', yaxt='n')
  title('CONFUSION MATRIX for Average Probabilities - Validation', cex.main=2)

  # Quadrant cells (solid vs. translucent fills of the same green).
  rect(150, 430, 240, 370, col='#1c6155')
  rect(250, 430, 340, 370, col='#1c615570')
  rect(150, 305, 240, 365, col='#1c615570')
  rect(250, 305, 340, 365, col='#1c6155')

  # Row/column headers.
  text(195, 435, 'Benign', cex=1.2)
  text(295, 435, 'Malignant', cex=1.2)
  text(125, 370, 'Predicted', cex=1.3, srt=90, font=2)
  text(245, 450, 'Actual', cex=1.3, font=2)
  text(140, 400, 'Benign', cex=1.2, srt=90)
  text(140, 335, 'Malignant', cex=1.2, srt=90)

  # Cell counts, in the column-major order given by as.numeric(cm$table).
  counts <- as.numeric(cm$table)
  cell_x <- c(195, 195, 295, 295)
  cell_y <- c(400, 335, 400, 335)
  for (k in seq_along(counts)) {
    text(cell_x[k], cell_y[k], counts[k], cex=1.6, font=2, col='white')
  }

  # Bottom panel: byClass slots 1, 2, 5, 6, 7 plus overall accuracy/kappa.
  plot(c(100, 0), c(100, 0), type = "n", xlab="", ylab="", main = "DETAILS", xaxt='n', yaxt='n')
  stat_x <- c(10, 30, 50, 70, 90)
  stat_idx <- c(1, 2, 5, 6, 7)
  for (k in seq_along(stat_idx)) {
    text(stat_x[k], 85, names(cm$byClass[stat_idx[k]]), cex=1.2, font=2)
    text(stat_x[k], 70, round(as.numeric(cm$byClass[stat_idx[k]]), 3), cex=1.2)
  }
  over_x <- c(30, 70)
  for (k in 1:2) {
    text(over_x[k], 35, names(cm$overall[k]), cex=1.5, font=2)
    text(over_x[k], 20, round(as.numeric(cm$overall[k]), 3), cex=1.4)
  }
}
# Summarise both ensembles (Sensitivity, Specificity, Accuracy) and rank
# all seven models on the validation set by sensitivity.
draw_confusion_matrix_Average_Validation(Average_confusion)# Majority Confusion as Data frame
DF_Majority_Confusion <- data.frame(c(Majority_confusion$byClass[c(1,2)], Majority_confusion$overall[1]))
colnames(DF_Majority_Confusion) <- "Majority Vote"
DF_Majority_Confusion <- t(DF_Majority_Confusion) # FINAL CONFUSION DATAFRAME
# Average Confusion as Data frame
DF_Average_Confusion <- data.frame(c(Average_confusion$byClass[c(1,2)], Average_confusion$overall[1]))
colnames(DF_Average_Confusion) <- "Average Probabilities"
DF_Average_Confusion <- t(DF_Average_Confusion) # FINAL CONFUSION DATAFRAMEComments: the Average seems to be better in Accuracy and Sensitivity.
Table of All 7 Models - On Validation
TABLE_7_MODELS <- rbind(DF_Best_Logistic_Confusion, DF_Best_KNN_Confusion, DF_best_boosted_confusion, DF_Best_DA_Confusion, DF_Best_Neural_Confusion, DF_Majority_Confusion, DF_Average_Confusion)
TABLE_7_MODELS <- as.data.frame(TABLE_7_MODELS)
# Sort by decreasing sensitivity (most important metric for cancer detection)
TABLE_7_MODELS <- TABLE_7_MODELS %>% arrange(-Sensitivity)
DT::datatable(round(TABLE_7_MODELS,4), caption = "7 Models on Validation - Order by Sensitivity") Comments: 3 Best Models according to both Sensitivity and Accuracy are: Neural Network, DA and Average Probabilities on Validation.
Best 5 Models - On Test
# Same preparation as for the validation set, now on the test set:
# echo each model's prediction DF (column 1 = 0/1 class, column 2 =
# class-1 probability) and coerce both columns to numeric for ensembling.
# Best Logistic Regression
DF_Best_Logistic_Predictions_Test
DF_Best_Logistic_Confusion_Test
# As Numeric (factor -> character -> numeric to keep the 0/1 coding)
DF_Best_Logistic_Predictions_Test[,1] <- as.numeric(as.character(DF_Best_Logistic_Predictions_Test[,1]))
DF_Best_Logistic_Predictions_Test[,2] <- as.numeric(DF_Best_Logistic_Predictions_Test[,2])
# Best K-Nearest Neighbor
DF_Best_KNN_Predictions_test
DF_Best_KNN_Confusion_test
# As Numeric
DF_Best_KNN_Predictions_test[,1] <- as.numeric(as.character(DF_Best_KNN_Predictions_test[,1]))
DF_Best_KNN_Predictions_test[,2] <- as.numeric(DF_Best_KNN_Predictions_test[,2])
# Best Boosting Trees
DF_Test_boosted_pred
DF_Boosted_Confusion_Test
# As Numeric
DF_Test_boosted_pred[,1] <- as.numeric(as.character(DF_Test_boosted_pred[,1]))
DF_Test_boosted_pred[,2] <- as.numeric(DF_Test_boosted_pred[,2])
# Discriminant Analysis
DF_Best_DA_Predictions_test
DF_Best_DA_Confusion_test
# As Numeric
DF_Best_DA_Predictions_test[,1] <- as.numeric(as.character(DF_Best_DA_Predictions_test[,1]))
DF_Best_DA_Predictions_test[,2] <- as.numeric(DF_Best_DA_Predictions_test[,2])
# Best Neural Network
DF_Best_Neural_Network_Predictions_Test
DF_Best_Neural_Confusion_Test
# As Numeric
DF_Best_Neural_Network_Predictions_Test[,1] <- as.numeric(as.character(DF_Best_Neural_Network_Predictions_Test[,1]))
DF_Best_Neural_Network_Predictions_Test[,2] <- as.numeric(DF_Best_Neural_Network_Predictions_Test[,2])Majority Vote
# One column of 0/1 class votes per model, one row per test case.
Majority_DF_Test <- data.frame(DF_Best_Logistic_Predictions_Test[,1], DF_Best_KNN_Predictions_test[,1], DF_Test_boosted_pred[,1], DF_Best_DA_Predictions_test[,1], DF_Best_Neural_Network_Predictions_Test[,1])
# Majority vote on the test set: 1 when more than half of the five models
# vote 1 (vectorized; replaces the original per-row loop, identical result).
Majority_DF_Test[, 6] <- as.numeric(rowMeans(Majority_DF_Test[, 1:5]) > 0.5)
colnames(Majority_DF_Test) <- c("Logistic Regression","K-Nearest Neighbor", "Boosted Trees","Discriminant Analysis", "Neural Network","Majority Vote")
Majority_DF_Test$'Test Actual' <- as.numeric(as.character(Test$diagnosis))
DT::datatable(Majority_DF_Test, caption = "Best 5 Models on Test - Majority of Votes")
Average of Models Probabilities
# One column of class-1 probabilities per model, one row per test case.
Average_DF_Test <- data.frame(DF_Best_Logistic_Predictions_Test[,2], DF_Best_KNN_Predictions_test[,2], DF_Test_boosted_pred[,2], DF_Best_DA_Predictions_test[,2], DF_Best_Neural_Network_Predictions_Test[,2])
# Ensemble probability on the test set: row-wise mean of the five model
# probabilities (vectorized; identical to the original per-row loop).
Average_DF_Test[, 6] <- rowMeans(Average_DF_Test[, 1:5])
colnames(Average_DF_Test) <- c("Logistic Regression","K-Nearest Neighbor", "Boosted Trees","Discriminant Analysis", "Neural Network","Average")
Average_DF_Test$'Average Cutoff 0.5' <- ifelse(Average_DF_Test$`Average`>0.5,1,0)
Average_DF_Test$'Test Actual' <- as.numeric(as.character(Test$diagnosis))
DT::datatable(round(Average_DF_Test,4), caption = "Best 5 Models on Test - Average of Probabilities")
Confusion Matrix of Majority and Average
# Majority vote Confusion Matrix on the test set: column 6 = vote,
# column 7 = actual test label; positive class = "1" (malignant).
Majority_factor_Test <- data.frame(as.factor(Majority_DF_Test[,6]), as.factor(Majority_DF_Test[,7]))
Majority_confusion_Test <- confusionMatrix(Majority_factor_Test[,1], Majority_factor_Test[,2], positive = "1")
draw_confusion_matrix_Majority_Test <- function(cm) {
  ## Two-panel confusion-matrix figure for the majority-vote ensemble on
  ## the test set (2x2 matrix on top, statistics strip below).
  layout(matrix(c(1,1,2)))
  par(mar=c(2,2,2,2))

  # Top panel: empty canvas in fixed user coordinates.
  plot(c(100, 345), c(300, 450), type = "n", xlab="", ylab="", xaxt='n', yaxt='n')
  title('CONFUSION MATRIX for Majority Vote - Test', cex.main=2)

  # Quadrant cells (solid vs. translucent fills of the same green).
  rect(150, 430, 240, 370, col='#1c6155')
  rect(250, 430, 340, 370, col='#1c615570')
  rect(150, 305, 240, 365, col='#1c615570')
  rect(250, 305, 340, 365, col='#1c6155')

  # Row/column headers.
  text(195, 435, 'Benign', cex=1.2)
  text(295, 435, 'Malignant', cex=1.2)
  text(125, 370, 'Predicted', cex=1.3, srt=90, font=2)
  text(245, 450, 'Actual', cex=1.3, font=2)
  text(140, 400, 'Benign', cex=1.2, srt=90)
  text(140, 335, 'Malignant', cex=1.2, srt=90)

  # Cell counts, in the column-major order given by as.numeric(cm$table).
  counts <- as.numeric(cm$table)
  cell_x <- c(195, 195, 295, 295)
  cell_y <- c(400, 335, 400, 335)
  for (k in seq_along(counts)) {
    text(cell_x[k], cell_y[k], counts[k], cex=1.6, font=2, col='white')
  }

  # Bottom panel: byClass slots 1, 2, 5, 6, 7 plus overall accuracy/kappa.
  plot(c(100, 0), c(100, 0), type = "n", xlab="", ylab="", main = "DETAILS", xaxt='n', yaxt='n')
  stat_x <- c(10, 30, 50, 70, 90)
  stat_idx <- c(1, 2, 5, 6, 7)
  for (k in seq_along(stat_idx)) {
    text(stat_x[k], 85, names(cm$byClass[stat_idx[k]]), cex=1.2, font=2)
    text(stat_x[k], 70, round(as.numeric(cm$byClass[stat_idx[k]]), 3), cex=1.2)
  }
  over_x <- c(30, 70)
  for (k in 1:2) {
    text(over_x[k], 35, names(cm$overall[k]), cex=1.5, font=2)
    text(over_x[k], 20, round(as.numeric(cm$overall[k]), 3), cex=1.4)
  }
}
draw_confusion_matrix_Majority_Test(Majority_confusion_Test)# Average Confusion Matrix
# Average-probability ensemble on the test set: column 7 = 0.5-cutoff
# class, column 8 = actual test label; positive class = "1".
Average_factor_Test <- data.frame(as.factor(Average_DF_Test[,7]), as.factor(Average_DF_Test[,8]))
Average_confusion_Test <- confusionMatrix(Average_factor_Test[,1], Average_factor_Test[,2], positive = "1")
draw_confusion_matrix_Average_Test <- function(cm) {
  ## Two-panel confusion-matrix figure for the average-probability
  ## ensemble on the test set (2x2 matrix on top, statistics strip below).
  layout(matrix(c(1,1,2)))
  par(mar=c(2,2,2,2))

  # Top panel: empty canvas in fixed user coordinates.
  plot(c(100, 345), c(300, 450), type = "n", xlab="", ylab="", xaxt='n', yaxt='n')
  title('CONFUSION MATRIX for Average Probabilities - Test', cex.main=2)

  # Quadrant cells (solid vs. translucent fills of the same green).
  rect(150, 430, 240, 370, col='#1c6155')
  rect(250, 430, 340, 370, col='#1c615570')
  rect(150, 305, 240, 365, col='#1c615570')
  rect(250, 305, 340, 365, col='#1c6155')

  # Row/column headers.
  text(195, 435, 'Benign', cex=1.2)
  text(295, 435, 'Malignant', cex=1.2)
  text(125, 370, 'Predicted', cex=1.3, srt=90, font=2)
  text(245, 450, 'Actual', cex=1.3, font=2)
  text(140, 400, 'Benign', cex=1.2, srt=90)
  text(140, 335, 'Malignant', cex=1.2, srt=90)

  # Cell counts, in the column-major order given by as.numeric(cm$table).
  counts <- as.numeric(cm$table)
  cell_x <- c(195, 195, 295, 295)
  cell_y <- c(400, 335, 400, 335)
  for (k in seq_along(counts)) {
    text(cell_x[k], cell_y[k], counts[k], cex=1.6, font=2, col='white')
  }

  # Bottom panel: byClass slots 1, 2, 5, 6, 7 plus overall accuracy/kappa.
  plot(c(100, 0), c(100, 0), type = "n", xlab="", ylab="", main = "DETAILS", xaxt='n', yaxt='n')
  stat_x <- c(10, 30, 50, 70, 90)
  stat_idx <- c(1, 2, 5, 6, 7)
  for (k in seq_along(stat_idx)) {
    text(stat_x[k], 85, names(cm$byClass[stat_idx[k]]), cex=1.2, font=2)
    text(stat_x[k], 70, round(as.numeric(cm$byClass[stat_idx[k]]), 3), cex=1.2)
  }
  over_x <- c(30, 70)
  for (k in 1:2) {
    text(over_x[k], 35, names(cm$overall[k]), cex=1.5, font=2)
    text(over_x[k], 20, round(as.numeric(cm$overall[k]), 3), cex=1.4)
  }
}
# Summarise both ensembles on the test set and rank all seven models
# by sensitivity.
draw_confusion_matrix_Average_Test(Average_confusion_Test)# Majority Confusion as Data frame
DF_Majority_Confusion_Test <- data.frame(c(Majority_confusion_Test$byClass[c(1,2)], Majority_confusion_Test$overall[1]))
colnames(DF_Majority_Confusion_Test) <- "Majority Vote"
DF_Majority_Confusion_Test <- t(DF_Majority_Confusion_Test) # FINAL CONFUSION DATAFRAME
# Average Confusion as Data frame
DF_Average_Confusion_Test <- data.frame(c(Average_confusion_Test$byClass[c(1,2)], Average_confusion_Test$overall[1]))
colnames(DF_Average_Confusion_Test) <- "Average Probabilities"
DF_Average_Confusion_Test <- t(DF_Average_Confusion_Test) # FINAL CONFUSION DATAFRAMEComments:
Table of All 7 Models - On Test
TABLE_7_MODELS_Test <- rbind(DF_Best_Logistic_Confusion_Test, DF_Best_KNN_Confusion_test, DF_Boosted_Confusion_Test, DF_Best_DA_Confusion_test, DF_Best_Neural_Confusion_Test, DF_Majority_Confusion_Test, DF_Average_Confusion_Test)
TABLE_7_MODELS_Test <- as.data.frame(TABLE_7_MODELS_Test)
# Sort by decreasing sensitivity
TABLE_7_MODELS_Test <- TABLE_7_MODELS_Test %>% arrange(-Sensitivity)
DT::datatable(round(TABLE_7_MODELS_Test,4), caption = "7 Models on Test - Order by Sensitivity") Comments:
Unsupervised Learning
Cluster Analysis
K-Means Clustering
set.seed(1)
# Duplicate Original to a Cluster Dataframe
KClusteringDF <- ORIGINAL
# Make Sure to be as Dataframe
KClusteringDF <- data.frame(KClusteringDF)
# Remove the "ID" Variable (first column)
KClusteringDF <- KClusteringDF[,-1]
# Preprocess Data: centre and scale so k-means distances are not
# dominated by large-scale features.
# NOTE(review): this assumes diagnosis is not a numeric column (caret's
# preProcess centres/scales numeric columns); if it were scaled, the
# == 0 / == 1 filters below would select nothing -- verify.
Norm_Kmeans <- preProcess(KClusteringDF, method = c("center", "scale"))
KClusteringDF_Preprocess <- predict(Norm_Kmeans, KClusteringDF)
# Separate Benign and Malign into 2 Datasets
Benign_ClusterDF <- KClusteringDF_Preprocess[KClusteringDF_Preprocess$diagnosis == 0,]
Malign_ClusterDF <- KClusteringDF_Preprocess[KClusteringDF_Preprocess$diagnosis == 1,]
# Without Preprocess for Malign DF (kept on the raw scale)
Malign_ClusterDF_No_Scale <- KClusteringDF[KClusteringDF$diagnosis == 1,]K-Means Clustering with all Dataset - Model 1
Let’s check if accounting for the whole dataset, we can find meaningful clusters.
set.seed(1)
# Load Library (already attached at the top of the file; kept so this
# chunk can run on its own)
library(factoextra)
# Labeling Tumors Type as Row Name (so cluster plots are readable)
KClusteringDF_Preprocess$diagnosis <- factor(KClusteringDF_Preprocess$diagnosis, levels = c(0,1), labels=c("Benign","Malign"))
rownames(KClusteringDF_Preprocess) <- paste(KClusteringDF_Preprocess$diagnosis, seq_len(nrow(KClusteringDF_Preprocess)), sep = "_")
# Optimal number of clusters by three criteria (elbow/WSS, silhouette,
# gap statistic). Fix: the export had fused these three calls onto one
# line; restored as separate statements.
fviz_nbclust(KClusteringDF_Preprocess[,-1], kmeans, method = "wss")
fviz_nbclust(KClusteringDF_Preprocess[,-1], kmeans, method = "silhouette")
fviz_nbclust(KClusteringDF_Preprocess[,-1], kmeans, method = "gap_stat")
# Create Clusters
# k = 2 clusters, 100 restarts to avoid poor local optima
Cluster_ALL <- kmeans(KClusteringDF_Preprocess[,-1], centers = 2, iter.max = 100, nstart = 100)Comments: We can find the optimal number of clusters with 3 differents methods: WSS ( Within-Cluster-Sum of Squared Errors) or also called Elbow Method, The Silhouette Method which accounts for the separation between clusters or lastly the Gap Statistic. Here all three methods give us the optimal number of 2 clusters, which make sense when taking into accounts the fact that there is either Benign or Malign type of tumors. Let’s see it graphically and how it performs with the real word.
Model 1 - Plot
set.seed(1)
# Plotting Clusters of Model 1 (first two principal components; built-in
# labels suppressed and replaced by nudged, de-overlapped row names)
fviz_cluster(Cluster_ALL, data = KClusteringDF_Preprocess[,-1], main="Cluster Model 1", labelsize = 0)+ geom_text(
label=rownames(KClusteringDF_Preprocess),
nudge_x = 0.25, nudge_y = 0.25,
check_overlap = T, size=2)Comments: We can see that K-Means without any human intervention found 2 clusters to be optimal on the whole dataset, and separated benign and malign tumors accordingly. We could check how it performed. Here Cluster Number 1 would be the Malign Tumors and Cluser Number 2 the Benign Tumors.
Model 1 - Performance
set.seed(1)
# Compare the k-means cluster assignments against the true diagnosis labels.
DF_Cluster_Performance <- as.data.frame(Cluster_ALL$cluster)
DF_Cluster_Performance <- cbind(DF_Cluster_Performance, KClusteringDF_Preprocess$diagnosis)
# Plain integer row names (the originals carry the diagnosis prefix).
# Fix: the original `1:dim(DF_Cluster_Performance)` silently used only the
# first element of dim() and emitted a warning; seq_len(nrow(...)) is exact.
rownames(DF_Cluster_Performance) <- seq_len(nrow(DF_Cluster_Performance))
# Map cluster ids to the 0/1 diagnosis coding.
# NOTE(review): this assumes cluster 1 corresponds to malignant tumours;
# kmeans cluster numbering is arbitrary, so verify against the centroids
# before trusting the confusion matrix below.
DF_Cluster_Performance$`Cluster_ALL$cluster` <- ifelse(DF_Cluster_Performance$`Cluster_ALL$cluster` == 1, 1,0)
DF_Cluster_Performance$`KClusteringDF_Preprocess$diagnosis` <- ifelse(DF_Cluster_Performance$`KClusteringDF_Preprocess$diagnosis` == "Malign",1,0)
# Convert as Factor the Binary Outcomes
DF_Cluster_Performance$`Cluster_ALL$cluster` <- factor(DF_Cluster_Performance$`Cluster_ALL$cluster`)
DF_Cluster_Performance$`KClusteringDF_Preprocess$diagnosis` <- factor(DF_Cluster_Performance$`KClusteringDF_Preprocess$diagnosis`)
# Confusion Matrix
Confusion_Matrix_K_Means1 <- confusionMatrix(data = DF_Cluster_Performance$`Cluster_ALL$cluster`, reference = DF_Cluster_Performance$`KClusteringDF_Preprocess$diagnosis`,positive = "1")
# Create the Function for Confusion Matrix
draw_confusion_matrix_K_Means1 <- function(cm) {
  ## Two-panel confusion-matrix figure for the k-means labelling
  ## (Model 1): 2x2 matrix on top, statistics strip below.
  layout(matrix(c(1,1,2)))
  par(mar=c(2,2,2,2))

  # Top panel: empty canvas in fixed user coordinates.
  plot(c(100, 345), c(300, 450), type = "n", xlab="", ylab="", xaxt='n', yaxt='n')
  title('CONFUSION MATRIX for K-Means - Model 1', cex.main=2)

  # Quadrant cells (solid vs. translucent fills of the same green).
  rect(150, 430, 240, 370, col='#1c6155')
  rect(250, 430, 340, 370, col='#1c615570')
  rect(150, 305, 240, 365, col='#1c615570')
  rect(250, 305, 340, 365, col='#1c6155')

  # Row/column headers.
  text(195, 435, 'Benign', cex=1.2)
  text(295, 435, 'Malignant', cex=1.2)
  text(125, 370, 'Predicted', cex=1.3, srt=90, font=2)
  text(245, 450, 'Actual', cex=1.3, font=2)
  text(140, 400, 'Benign', cex=1.2, srt=90)
  text(140, 335, 'Malignant', cex=1.2, srt=90)

  # Cell counts, in the column-major order given by as.numeric(cm$table).
  counts <- as.numeric(cm$table)
  cell_x <- c(195, 195, 295, 295)
  cell_y <- c(400, 335, 400, 335)
  for (k in seq_along(counts)) {
    text(cell_x[k], cell_y[k], counts[k], cex=1.6, font=2, col='white')
  }

  # Bottom panel: byClass slots 1, 2, 5, 6, 7 plus overall accuracy/kappa.
  plot(c(100, 0), c(100, 0), type = "n", xlab="", ylab="", main = "DETAILS", xaxt='n', yaxt='n')
  stat_x <- c(10, 30, 50, 70, 90)
  stat_idx <- c(1, 2, 5, 6, 7)
  for (k in seq_along(stat_idx)) {
    text(stat_x[k], 85, names(cm$byClass[stat_idx[k]]), cex=1.2, font=2)
    text(stat_x[k], 70, round(as.numeric(cm$byClass[stat_idx[k]]), 3), cex=1.2)
  }
  over_x <- c(30, 70)
  for (k in 1:2) {
    text(over_x[k], 35, names(cm$overall[k]), cex=1.5, font=2)
    text(over_x[k], 20, round(as.numeric(cm$overall[k]), 3), cex=1.4)
  }
}
# Plot the Confusion Matrix for the k-means labelling of Model 1
draw_confusion_matrix_K_Means1(Confusion_Matrix_K_Means1)Comments: We can appreciate the K-Means algorithm to have found 2 types of differents tumours in our dataset (Malign and Benign), showing that we do have good seperability from our features. The Accuracy is not quite good, again the clustering is not aimed at being good in predictions but rather show some insights about the dataset mixed with some knowledge in the field.
K-Means Clustering with only Malign Tumors - Model 2
Since cancerous tumors in the breast are not all equal — some being at different stages or of different types — we could apply the K-Means clustering model to find out whether there is some separation among them, and whether it could suggest an a priori number of clusters for further analysis in medical research.
Types of Breast Cancer - American Society
For examples, we can find this article about how stages are rated:
“In both staging systems, 7 key pieces of information are used:
- The extent (size) of the tumor (T): How large is the cancer? Has it grown into nearby areas?
- The spread to nearby lymph nodes (N): Has the cancer spread to nearby lymph nodes? If so, how many?
- The spread (metastasis) to distant sites (M): Has the cancer spread to distant organs such as the lungs or liver?
- Estrogen Receptor (ER) status: Does the cancer have the protein called an estrogen receptor?
- Progesterone Receptor (PR) status: Does the cancer have the protein called a progesterone receptor?
- HER2 status: Does the cancer make too much of a protein called HER2?
- Grade of the cancer (G): How much do the cancer cells look like normal cells?”
We can see that we lack a lot of information only using this dataset, we could only infer the size of the cancer based on 1 tumor, without nearby information. Thus we will be limited in the clustering method to only size as an information for the stage of the tumor.
set.seed(1)
# Load Library
library(factoextra)
# Convert the diagnosis label (column 1) to a factor
Malign_ClusterDF$diagnosis <- factor(Malign_ClusterDF$diagnosis)
# Optimal Number of Clusters: compare the three standard heuristics
# (elbow / within-cluster sum of squares, average silhouette, gap statistic).
# NOTE: these three calls were collapsed onto a single line (invalid R syntax);
# one statement per line restores a runnable script.
fviz_nbclust(Malign_ClusterDF[,-1], kmeans, method = "wss")
fviz_nbclust(Malign_ClusterDF[,-1], kmeans, method = "silhouette")
fviz_nbclust(Malign_ClusterDF[,-1], kmeans, method = "gap_stat")
# Create Clusters: k = 3 candidate model; nstart = 100 random restarts
# guard against poor local optima of the k-means objective
Cluster_Malign_1 <- kmeans(Malign_ClusterDF[,-1], centers = 3, iter.max = 100, nstart = 100)
Cluster_Malign_2 <- kmeans(Malign_ClusterDF[,-1], centers = 2, iter.max = 100, nstart = 100)Comments: All 3 methods do not converge to the same number of clusters, but we can see that the Elbow Method and the Silhouette Method would suggest either 2 or 3 groups as optimal. The Gap Statistic shows no cluster separation within the Malignant tumors. We can try separating into 2 and 3 clusters and examine the profiling of those groups. (For simplicity, we only compare the mean variables as meaningful measures to interpret our tumors.)
Model 2 with 3 Clusters - Plot
# Set seed for reproducibility of the plot
set.seed(1)
# Plotting Model 2 (k-means on Malignant tumors only) with 3 clusters
fviz_cluster(Cluster_Malign_1, data = Malign_ClusterDF[,-1], main="Cluster Model 2 - Only Malign Tumors", subtitle="with 3 Clusters", labelsize = 0)Comments: Some overlap occurs in this 2D projection, but considering all dimensions, there is no overlap at all. With human interpretation, we can see that there would indeed be 3 different clusters within the Malignant tumors. Let's check the centroids.
Centroid of Model 2 with 3 Clusters
Clusters_Malign_1_Centers <- Cluster_Malign_1$centers
DT::datatable(round(Clusters_Malign_1_Centers,5), caption = "Centroid from Model 2 - 3 Clusters")Comments: We can see that radius_mean (perimeter_mean and area_mean are quite similar) is indeed one variable clearly separating the Malignant tumors into 3 clusters; concavity_mean and compactness_mean do so as well. smoothness_mean also separates the clusters from each other. (For simplicity, we only compare the mean variables as meaningful measures to interpret our tumors.)
Cluster Members with 3 Clusters
# Partition the unscaled Malignant data frame by the 3-cluster assignment
split1 <- split(Malign_ClusterDF_No_Scale, Cluster_Malign_1$cluster)
# Pull each cluster's members out into its own data frame
cluster_1 <- split1[["1"]]
cluster_2 <- split1[["2"]]
cluster_3 <- split1[["3"]]
# Data Table
DT::datatable(cluster_1[,-1], caption = "Cluster 1")DT::datatable(cluster_2[,-1], caption = "Cluster 2 ")DT::datatable(cluster_3[,-1], caption = "Cluster 3")Boxplots for Comparison of radius_mean
# Load Libraries
library(ggpubr)
library(ggplot2)
# Helper: boxplot of radius_mean for one cluster, on a shared 10-30 mm
# y-scale so the panels are directly comparable.
# (Also fixes the "millimiters" typo in the subtitle.)
plot_radius_box <- function(cluster_df, cluster_title) {
  ggplot(cluster_df) +
    aes(x = "", y = radius_mean) +
    geom_boxplot(fill = "#1c6155") +
    labs(title = cluster_title,
         subtitle = "in millimeters") +
    theme_minimal() + ylim(10, 30)
}
# One boxplot per cluster of the 3-cluster model
boxcluster1 <- plot_radius_box(cluster_1, "Cluster 1")
boxcluster2 <- plot_radius_box(cluster_2, "Cluster 2")
boxcluster3 <- plot_radius_box(cluster_3, "Cluster 3")
# Arrange the three panels side by side (consumed by annotate_figure below)
ggarrange1 <- ggarrange(boxcluster1, boxcluster2, boxcluster3, ncol = 3)
annotate_figure(ggarrange1,
top = text_grob("Boxplot for radius_mean Among Clusters", color = "black", face = "bold", size = 14))Comments: We can see some tumors being greater than 20 mm, some between 15 mm and 20 mm, and lastly some under 15 mm. Such a result in the cluster medians is very interesting, and knowing how the staging system works can actually lead us to prefer clustering into 2 groups: one cluster being below 20 mm and the other greater than or equal to 20 mm. We will do so in the following part.
Model 2 with 2 Clusters - Plot
# Set seed for reproducibility of the plot
set.seed(1)
# Plotting Model 2 (k-means on Malignant tumors only) with 2 clusters
fviz_cluster(Cluster_Malign_2, data = Malign_ClusterDF[,-1], main="Cluster Model 2 - Only Malign Tumors", subtitle="with 2 Clusters", labelsize = 0)Comments: Overlapping still occurs in this 2D projection, but we can also see the trend of 2 groups: the red one is more spread out than the blue one, and some spread also occurs in the bottom center of the plot for the blue cluster.
Centroid of Model 2 with 2 Clusters
Clusters_Malign_2_Centers <- Cluster_Malign_2$centers
DT::datatable(round(Clusters_Malign_2_Centers,5), caption = "Centroid from Model 2 - 2 Clusters")Comments: We can also see the radius_mean (perimeter_mean and area_mean) being very important in the separation, compactness_mean and concavity_mean as well. symmetry_mean is also quite different and smoothness_mean as well.
Cluster Members with 2 Clusters
# Partition the unscaled Malignant data frame by the 2-cluster assignment
split2 <- split(Malign_ClusterDF_No_Scale, Cluster_Malign_2$cluster)
# Pull each cluster's members out into its own data frame
cluster2_1 <- split2[["1"]]
cluster2_2 <- split2[["2"]]
# Data Table
DT::datatable(cluster2_1[,-1], caption = "Cluster 1")DT::datatable(cluster2_2[,-1], caption = "Cluster 2")Boxplots for Comparison of radius_mean
# Load Libraries
library(ggpubr)
library(ggplot2)
# Boxplot with ggplot
# BUG FIX: this section profiles the 2-cluster model, so it must plot
# cluster2_1 / cluster2_2 — the original reused cluster_1 / cluster_2 from
# the 3-cluster model, contradicting the "(T2)" / "(T1)" titles and the
# median discussion (>20 mm vs ~14 mm) that follows.
boxcluster1 <- ggplot(cluster2_1) +
  aes(x = "", y = radius_mean) +
  geom_boxplot(fill = "#1c6155") +
  labs(title = "Cluster 1 (T2)",
       subtitle = "in millimeters") +
  theme_minimal() + ylim(10, 30)
boxcluster2 <- ggplot(cluster2_2) +
  aes(x = "", y = radius_mean) +
  geom_boxplot(fill = "#1c6155") +
  labs(title = "Cluster 2 (T1)",
       subtitle = "in millimeters") +
  theme_minimal() + ylim(10, 30)
# Arrange both panels side by side (consumed by annotate_figure below)
ggarrange2 <- ggarrange(boxcluster1, boxcluster2)
annotate_figure(ggarrange2,
top = text_grob("Boxplot for radius_mean Among Clusters", color = "black", face = "bold", size = 14))Comments: Without having the full key pieces of information for the staging systems from the American Cancer Society, we already have some metrics for the T key, which is the size of the tumor, but without the nearby areas. The dataset suggests that the measures are for primary tumors only. If we look at Cluster 1, the radius_mean median seems to be higher than 2 cm (20 mm) but less than 5 cm (50 mm), so we would attribute the T2 key to this cluster. Cluster 2, in contrast, has a median close to 1.4 cm (14 mm); since this is less than 2 cm (20 mm), we could attribute the T1 key to this cluster. Nevertheless, we should remember that some members of Cluster 1 are smaller than 20 mm, and thus we should not categorize them as T2 following the guidelines. For simplicity, we will keep those tumors in Cluster 1, but if we wanted to decide whether or not a member qualifies as T2, we should use other metrics to check the exact size before removing the T2 label from it.
Cluster 2 with T1 could potentially lead us to such Stages:
Stage IA: The tumor is small, invasive, and has not spread to the lymph nodes (T1, N0, M0). Stage IB: Cancer has spread to the lymph nodes and the cancer in the lymph node is larger than 0.2 mm but less than 2 mm in size. There is either no evidence of a tumor in the breast or the tumor in the breast is 20 mm or smaller (T0 or T1, N1mi, M0). Stage IIIC: A tumor of any size that has spread to 10 or more axillary lymph nodes, the internal mammary lymph nodes, and/or the lymph nodes under the collarbone. It has not spread to other parts of the body (any T, N3, M0). Stage IV (metastatic): The tumor can be any size and has spread to other organs, such as the bones, lungs, brain, liver, distant lymph nodes, or chest wall (any T, any N, M1). Metastatic cancer found when the cancer is first diagnosed occurs about 6% of the time. This may be called de novo metastatic breast cancer. Most commonly, metastatic breast cancer is found after a previous diagnosis of early stage breast cancer.
Cluster 1 with T2 could potentially lead us to such Stages:
Stage IIA: Any 1 of these conditions: The tumor is larger than 20 mm but not larger than 50 mm and has not spread to the axillary lymph nodes (T2, N0, M0). Stage IIB: The tumor is larger than 20 mm but not larger than 50 mm and has spread to 1 to 3 axillary lymph nodes (T2, N1, M0). Stage IIIA: The tumor of any size has spread to 4 to 9 axillary lymph nodes or to internal mammary lymph nodes. It has not spread to other parts of the body (T0, T1, T2, or T3; N2; M0). Stage IIIC: A tumor of any size that has spread to 10 or more axillary lymph nodes, the internal mammary lymph nodes, and/or the lymph nodes under the collarbone. It has not spread to other parts of the body (any T, N3, M0). Stage IV (metastatic): The tumor can be any size and has spread to other organs, such as the bones, lungs, brain, liver, distant lymph nodes, or chest wall (any T, any N, M1). Metastatic cancer found when the cancer is first diagnosed occurs about 6% of the time. This may be called de novo metastatic breast cancer. Most commonly, metastatic breast cancer is found after a previous diagnosis of early stage breast cancer.
Comments: We can see that we lack a lot of information to actually determine the stage of the cancerous breast tumors. Depending on the source, we may lack 2 more pieces of information if we follow the Cancer.Net staging system: Node (N - Has the tumor spread to the lymph nodes? If so, where, what size, and how many?) and Metastasis (M - Has the cancer spread to other parts of the body?). The American Cancer Society requires far more information, up to 7 pieces in total, plus additional recurrence tests. Here are the 7 key parameters:
image
Proportions of T1 and T2 - Pie Chart
# Computing Proportions of T1 (cluster2_2) and T2 (cluster2_1)
# among all Malignant tumors
Proportions_T1 <- nrow(cluster2_2)/nrow(Malign_ClusterDF_No_Scale)
Proportions_T2 <- nrow(cluster2_1)/nrow(Malign_ClusterDF_No_Scale)
# Rounding Proportions
Proportions_T1 <- round(Proportions_T1, 3)
Proportions_T2 <- round(Proportions_T2, 3)
# Pie Chart Dataframe
# FIX: derive the counts from the clusters instead of hard-coding c(151, 61),
# so the chart stays consistent with the proportions if the clustering changes
Pie_T1_T2 <- data.frame(
  t = c("T1", "T2"),
  n = c(nrow(cluster2_2), nrow(cluster2_1)),
  prop = c(Proportions_T1, Proportions_T2))
# Pie ggplot
ggplot(Pie_T1_T2, aes(x="", y=n, fill=t)) +
geom_bar(stat="identity", width=1) +
coord_polar("y", start=0) + theme_void() + geom_text(aes(label = paste0(100*prop, "%")), position = position_stack(vjust=0.5), color="white", size=6) +
labs(x = NULL, y = NULL, fill = "T Category") + scale_fill_manual(values=c("#1c6155","#66807b")) + ggtitle("Pie Chart of T Category Proportions for Malign Tumors (221 obs.)")Comments:
Conclusion
References
Logistic Regression in Machine Learning
Convergence Error in Logistic Regression
Penalized Logistic Regression Essentials in R: Ridge, Lasso and Elastic Net
Lasso Regression in R (Step-by-Step)
How to create a ROC curve in R
How to choose the number of hidden layers and nodes in a feedforward neural network?
Introduction to Neural Networks for Java (second edition) by Jeff Heaton - Google Books
Do we need to set training set and testing set for clustering?
K-means Clustering: Algorithm, Applications, Evaluation Methods, and Drawbacks
Types of Breast Cancer - American Society
Breast Cancer Stages - cancer.org
Breast Cancer: Stages - Cancer.Net
Classification: LDA and QDA Approaches